This is an automated email from the ASF dual-hosted git repository.
krisden pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new b4f585d SOLR-16034: Enable spotless on clustering module
b4f585d is described below
commit b4f585d299a908b4243e0c7e297c4e3495752b84
Author: Kevin Risden <[email protected]>
AuthorDate: Sat Feb 19 11:00:26 2022 -0500
SOLR-16034: Enable spotless on clustering module
---
gradle/validation/spotless.gradle | 1 -
.../handler/clustering/ClusteringComponent.java | 308 ++++++++---------
.../org/apache/solr/handler/clustering/Engine.java | 123 ++++---
.../solr/handler/clustering/EngineContext.java | 47 ++-
.../solr/handler/clustering/EngineEntry.java | 30 +-
.../solr/handler/clustering/EngineParameters.java | 263 ++++++--------
.../handler/clustering/FlatKeysAttrVisitor.java | 168 +++++----
.../solr/handler/clustering/InputDocument.java | 14 +-
.../handler/clustering/PathResourceLookup.java | 7 +-
.../solr/handler/clustering/package-info.java | 16 +-
.../ClusteringComponentDistributedTest.java | 65 ++--
.../clustering/ClusteringComponentTest.java | 378 ++++++++++++---------
.../clustering/EchoClusteringAlgorithm.java | 38 +--
.../EchoClusteringAlgorithmProvider.java | 4 +-
.../clustering/MockClusteringAlgorithm.java | 58 ++--
.../handler/clustering/ResourceCheckAlgorithm.java | 45 +--
.../apache/solr/handler/clustering/SampleData.java | 246 +++++++-------
17 files changed, 909 insertions(+), 902 deletions(-)
diff --git a/gradle/validation/spotless.gradle b/gradle/validation/spotless.gradle
index bc726e8..3c25cbc 100644
--- a/gradle/validation/spotless.gradle
+++ b/gradle/validation/spotless.gradle
@@ -44,7 +44,6 @@ configure(project(":solr").subprojects) { prj ->
// Exclude certain files (generated ones, mostly).
switch (project.path) {
- case ":solr:modules:clustering":
case ":solr:modules:extraction":
case ":solr:modules:gcs-repository":
case ":solr:modules:hadoop-auth":
diff --git a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java
index de044c5..3d7b11e 100644
--- a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java
+++ b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java
@@ -16,6 +16,20 @@
*/
package org.apache.solr.handler.clustering;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
@@ -46,78 +60,47 @@ import org.carrot2.clustering.Cluster;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.function.Function;
-import java.util.stream.Collectors;
-
/**
- * A {@link SearchComponent} for dynamic, unsupervised grouping of
- * search results based on the content of their text fields or contextual
- * snippets around query-matching regions.
+ * A {@link SearchComponent} for dynamic, unsupervised grouping of search results based on the
+ * content of their text fields or contextual snippets around query-matching regions.
*
- * <p>
- * The default implementation uses clustering algorithms from the
- * <a href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
+ * <p>The default implementation uses clustering algorithms from the <a
+ * href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
*
* @lucene.experimental
*/
public class ClusteringComponent extends SearchComponent implements
SolrCoreAware {
- /**
- * Default component name and parameter prefix.
- */
+ /** Default component name and parameter prefix. */
public static final String COMPONENT_NAME = "clustering";
/**
- * Request parameter that selects one of the {@link Engine} configurations
- * out of many possibly defined in the component's initialization parameters.
+ * Request parameter that selects one of the {@link Engine} configurations
out of many possibly
+ * defined in the component's initialization parameters.
*/
public static final String REQUEST_PARAM_ENGINE = COMPONENT_NAME + ".engine";
- /**
- * Engine configuration initialization block name.
- */
+ /** Engine configuration initialization block name. */
public static final String INIT_SECTION_ENGINE = "engine";
- /**
- * Response section name containing output clusters.
- */
+ /** Response section name containing output clusters. */
public static final String RESPONSE_SECTION_CLUSTERS = "clusters";
- /**
- * Default log sink.
- */
+ /** Default log sink. */
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
- * An internal request parameter for shard requests used for collecting
- * input documents for clustering.
+ * An internal request parameter for shard requests used for collecting
input documents for
+ * clustering.
*/
private static final String REQUEST_PARAM_COLLECT_INPUTS = COMPONENT_NAME +
".collect-inputs";
- /**
- * Shard request response section name containing partial document inputs.
- */
+ /** Shard request response section name containing partial document inputs.
*/
private static final String RESPONSE_SECTION_INPUT_DOCUMENTS =
"clustering-inputs";
- /**
- * All engines declared in this component's initialization block.
- */
+ /** All engines declared in this component's initialization block. */
private final List<EngineEntry> declaredEngines = new ArrayList<>();
- /**
- * Declaration-order list of available search clustering engines.
- */
+ /** Declaration-order list of available search clustering engines. */
private final LinkedHashMap<String, EngineEntry> engines = new
LinkedHashMap<>();
private static boolean isComponentEnabled(ResponseBuilder rb) {
@@ -126,44 +109,46 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
private static List<InputDocument>
documentsFromNamedList(List<NamedList<Object>> docList) {
return docList.stream()
- .map(docProps -> {
- InputDocument doc = new InputDocument(
- docProps.get("id"),
- (String) docProps.get("language"));
-
- docProps.forEach((fieldName, value) -> {
- doc.addClusteredField(fieldName, (String) value);
- });
- doc.visitFields(docProps::add);
- return doc;
- })
+ .map(
+ docProps -> {
+ InputDocument doc =
+ new InputDocument(docProps.get("id"), (String)
docProps.get("language"));
+
+ docProps.forEach(
+ (fieldName, value) -> {
+ doc.addClusteredField(fieldName, (String) value);
+ });
+ doc.visitFields(docProps::add);
+ return doc;
+ })
.collect(Collectors.toList());
}
private static List<NamedList<Object>>
documentsToNamedList(List<InputDocument> documents) {
return documents.stream()
- .map(doc -> {
- NamedList<Object> docProps = new SimpleOrderedMap<>();
- docProps.add("id", doc.getId());
- docProps.add("language", doc.language());
- doc.visitFields(docProps::add);
- return docProps;
- })
+ .map(
+ doc -> {
+ NamedList<Object> docProps = new SimpleOrderedMap<>();
+ docProps.add("id", doc.getId());
+ docProps.add("language", doc.language());
+ doc.visitFields(docProps::add);
+ return docProps;
+ })
.collect(Collectors.toList());
}
- private static List<NamedList<Object>> clustersToNamedList(List<InputDocument> documents,
- List<Cluster<InputDocument>> clusters,
- EngineParameters params) {
+ private static List<NamedList<Object>> clustersToNamedList(
+ List<InputDocument> documents,
+ List<Cluster<InputDocument>> clusters,
+ EngineParameters params) {
List<NamedList<Object>> result = new ArrayList<>();
clustersToNamedListRecursive(clusters, result, params);
if (params.includeOtherTopics()) {
LinkedHashSet<InputDocument> clustered = new LinkedHashSet<>();
clusters.forEach(cluster -> collectUniqueDocuments(cluster, clustered));
- List<InputDocument> unclustered = documents.stream()
- .filter(doc -> !clustered.contains(doc))
- .collect(Collectors.toList());
+ List<InputDocument> unclustered =
+ documents.stream().filter(doc ->
!clustered.contains(doc)).collect(Collectors.toList());
if (!unclustered.isEmpty()) {
NamedList<Object> cluster = new SimpleOrderedMap<>();
@@ -171,8 +156,9 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
cluster.add(ClusteringResponse.IS_OTHER_TOPICS, true);
cluster.add(ClusteringResponse.LABELS_NODE,
Collections.singletonList("Other topics"));
cluster.add(ClusteringResponse.SCORE_NODE, 0d);
- cluster.add(ClusteringResponse.DOCS_NODE,
unclustered.stream().map(InputDocument::getId)
- .collect(Collectors.toList()));
+ cluster.add(
+ ClusteringResponse.DOCS_NODE,
+
unclustered.stream().map(InputDocument::getId).collect(Collectors.toList()));
}
}
@@ -181,7 +167,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
private static void clustersToNamedListRecursive(
List<Cluster<InputDocument>> outputClusters,
- List<NamedList<Object>> parent, EngineParameters params) {
+ List<NamedList<Object>> parent,
+ EngineParameters params) {
for (Cluster<InputDocument> cluster : outputClusters) {
NamedList<Object> converted = new SimpleOrderedMap<>();
parent.add(converted);
@@ -206,8 +193,9 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
docs = new ArrayList<>(collectUniqueDocuments(cluster, new
LinkedHashSet<>()));
}
- converted.add(ClusteringResponse.DOCS_NODE,
docs.stream().map(InputDocument::getId)
- .collect(Collectors.toList()));
+ converted.add(
+ ClusteringResponse.DOCS_NODE,
+
docs.stream().map(InputDocument::getId).collect(Collectors.toList()));
if (params.includeSubclusters() && !cluster.getClusters().isEmpty()) {
List<NamedList<Object>> subclusters = new ArrayList<>();
@@ -217,7 +205,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
}
}
- private static LinkedHashSet<InputDocument>
collectUniqueDocuments(Cluster<InputDocument> cluster,
LinkedHashSet<InputDocument> unique) {
+ private static LinkedHashSet<InputDocument> collectUniqueDocuments(
+ Cluster<InputDocument> cluster, LinkedHashSet<InputDocument> unique) {
unique.addAll(cluster.getDocuments());
for (Cluster<InputDocument> sub : cluster.getClusters()) {
collectUniqueDocuments(sub, unique);
@@ -232,7 +221,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
if (args != null) {
for (Map.Entry<String, ?> entry : args) {
if (!INIT_SECTION_ENGINE.equals(entry.getKey())) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
"Unrecognized configuration entry: " + entry.getKey());
}
@@ -243,28 +233,35 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
@Override
public void inform(SolrCore core) {
- declaredEngines.forEach(engineEntry -> {
- if (!engineEntry.initialize(core)) {
- if (engineEntry.optional) {
- if (log.isInfoEnabled()) {
- log.info("Optional clustering engine is not available: {}",
engineEntry.engineName);
+ declaredEngines.forEach(
+ engineEntry -> {
+ if (!engineEntry.initialize(core)) {
+ if (engineEntry.optional) {
+ if (log.isInfoEnabled()) {
+ log.info("Optional clustering engine is not available: {}",
engineEntry.engineName);
+ }
+ } else {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ "A required clustering engine failed to initialize, check
the logs: "
+ + engineEntry.engineName);
+ }
+ } else {
+ if (engines.put(engineEntry.engineName, engineEntry) != null) {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ String.format(
+ Locale.ROOT,
+ "Duplicate clustering engine named '%s'.",
+ engineEntry.engineName));
+ }
}
- } else {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- "A required clustering engine failed to initialize, check the
logs: " + engineEntry.engineName);
- }
- } else {
- if (engines.put(engineEntry.engineName, engineEntry) != null) {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- String.format(Locale.ROOT,
- "Duplicate clustering engine named '%s'.",
engineEntry.engineName));
- }
- }
- });
+ });
if (engines.size() > 0) {
if (log.isInfoEnabled()) {
- log.info("The following clustering engines are available: {}",
+ log.info(
+ "The following clustering engines are available: {}",
String.join(", ", engines.keySet()));
}
} else {
@@ -294,8 +291,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
List<InputDocument> inputs = getDocuments(rb, parameters);
- if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false) &&
- rb.req.getParams().getBool(REQUEST_PARAM_COLLECT_INPUTS, false)) {
+ if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false)
+ && rb.req.getParams().getBool(REQUEST_PARAM_COLLECT_INPUTS, false)) {
rb.rsp.add(RESPONSE_SECTION_INPUT_DOCUMENTS,
documentsToNamedList(inputs));
} else {
doCluster(rb, engine, inputs, parameters);
@@ -309,8 +306,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
}
// Make sure the component is enabled for shard request.
- assert sreq.params.getBool(COMPONENT_NAME, false) :
- "Shard request should propagate clustering component enabled state?";
+ assert sreq.params.getBool(COMPONENT_NAME, false)
+ : "Shard request should propagate clustering component enabled state?";
// Piggyback collecting inputs for clustering on top of get fields request.
if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
@@ -331,13 +328,15 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
.flatMap(shardRequest -> shardRequest.responses.stream())
.filter(rsp -> rsp.getException() == null)
.map(rsp -> rsp.getSolrResponse().getResponse())
- .forEach(response -> {
- @SuppressWarnings("unchecked")
- List<NamedList<Object>> partialInputs = (List<NamedList<Object>>)
response.get(RESPONSE_SECTION_INPUT_DOCUMENTS);
- if (partialInputs != null) {
- inputs.addAll(documentsFromNamedList(partialInputs));
- }
- });
+ .forEach(
+ response -> {
+ @SuppressWarnings("unchecked")
+ List<NamedList<Object>> partialInputs =
+ (List<NamedList<Object>>)
response.get(RESPONSE_SECTION_INPUT_DOCUMENTS);
+ if (partialInputs != null) {
+ inputs.addAll(documentsFromNamedList(partialInputs));
+ }
+ });
EngineEntry engine = getEngine(rb);
EngineParameters parameters =
engine.defaults.derivedFrom(rb.req.getParams());
@@ -345,21 +344,21 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
}
}
- /**
- * Run clustering of input documents and append the result to the response.
- */
- private void doCluster(ResponseBuilder rb, EngineEntry engine,
List<InputDocument> inputs, EngineParameters parameters) {
+ /** Run clustering of input documents and append the result to the response.
*/
+ private void doCluster(
+ ResponseBuilder rb,
+ EngineEntry engine,
+ List<InputDocument> inputs,
+ EngineParameters parameters) {
// log.warn("# CLUSTERING: " + inputs.size() + " document(s), contents:\n - "
// + inputs.stream().map(Object::toString).collect(Collectors.joining("\n - ")));
List<Cluster<InputDocument>> clusters = engine.get().cluster(parameters,
rb.getQuery(), inputs);
rb.rsp.add(RESPONSE_SECTION_CLUSTERS, clustersToNamedList(inputs,
clusters, parameters));
}
- /**
- * Prepares input documents for clustering.
- */
- private List<InputDocument> getDocuments(ResponseBuilder responseBuilder,
- EngineParameters requestParameters)
throws IOException {
+ /** Prepares input documents for clustering. */
+ private List<InputDocument> getDocuments(
+ ResponseBuilder responseBuilder, EngineParameters requestParameters)
throws IOException {
SolrQueryRequest solrRequest = responseBuilder.req;
Query query = responseBuilder.getQuery();
@@ -385,12 +384,13 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
args.put(HighlightParams.FRAGSIZE, requestParameters.contextSize());
args.put(HighlightParams.SNIPPETS, requestParameters.contextCount());
// TODO highlight all docs at once instead of 1-by-1
- req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
- @Override
- public SolrIndexSearcher getSearcher() {
- return indexSearcher;
- }
- };
+ req =
+ new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
+ @Override
+ public SolrIndexSearcher getSearcher() {
+ return indexSearcher;
+ }
+ };
}
Map<String, Function<IndexableField, String>> fieldsToLoad = new
LinkedHashMap<>();
@@ -418,42 +418,44 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
Function<IndexableField, String> toString =
fieldsToLoad.get(fieldName);
if (toString != null) {
String value = toString.apply(indexableField);
- docFieldValues.compute(fieldName, (k, v) -> {
- if (v == null) {
- return value;
- } else {
- return v + " . " + value;
- }
- });
+ docFieldValues.compute(
+ fieldName,
+ (k, v) -> {
+ if (v == null) {
+ return value;
+ } else {
+ return v + " . " + value;
+ }
+ });
}
}
- InputDocument inputDocument = new InputDocument(
- docFieldValues.get(requestParameters.docIdField()),
- docLanguage.apply(docFieldValues));
+ InputDocument inputDocument =
+ new InputDocument(
+ docFieldValues.get(requestParameters.docIdField()),
+ docLanguage.apply(docFieldValues));
result.add(inputDocument);
Function<String, String> snippetProvider = (field) -> null;
if (preferQueryContext) {
- DocList docAsList = new DocSlice(0, 1,
- new int[]{docId},
- new float[]{1.0f},
- 1,
- 1.0f,
- TotalHits.Relation.EQUAL_TO);
-
- NamedList<?> highlights = highlighter.doHighlighting(docAsList, query,
req, fieldsToCluster);
+ DocList docAsList =
+ new DocSlice(
+ 0, 1, new int[] {docId}, new float[] {1.0f}, 1, 1.0f,
TotalHits.Relation.EQUAL_TO);
+
+ NamedList<?> highlights =
+ highlighter.doHighlighting(docAsList, query, req, fieldsToCluster);
if (highlights != null && highlights.size() == 1) {
@SuppressWarnings("unchecked")
NamedList<String[]> tmp = (NamedList<String[]>) highlights.getVal(0);
- snippetProvider = (field) -> {
- String[] values = tmp.get(field);
- if (values == null) {
- return null;
- } else {
- return String.join(" . ", Arrays.asList(values));
- }
- };
+ snippetProvider =
+ (field) -> {
+ String[] values = tmp.get(field);
+ if (values == null) {
+ return null;
+ } else {
+ return String.join(" . ", Arrays.asList(values));
+ }
+ };
}
}
@@ -475,8 +477,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
private EngineEntry getEngine(ResponseBuilder rb) {
if (engines.isEmpty()) {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- "No clustering engines are defined or loaded.");
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR, "No clustering engines are defined or
loaded.");
}
EngineEntry engine;
@@ -484,8 +486,8 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
if (name != null) {
engine = engines.get(name);
if (engine == null) {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- "Clustering engine unknown or not loaded: " + name);
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR, "Clustering engine unknown or not loaded:
" + name);
}
} else {
engine = engines.values().iterator().next();
@@ -493,9 +495,7 @@ public class ClusteringComponent extends SearchComponent
implements SolrCoreAwar
return engine;
}
- /**
- * @return A map of initialized clustering engines, exposed for tests only.
- */
+ /** @return A map of initialized clustering engines, exposed for tests only.
*/
Set<String> getEngineNames() {
return engines.keySet();
}
diff --git a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/Engine.java b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/Engine.java
index 79cfd9e..99763df 100644
--- a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/Engine.java
+++ b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/Engine.java
@@ -16,6 +16,16 @@
*/
package org.apache.solr.handler.clustering;
+import java.lang.invoke.MethodHandles;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
@@ -28,17 +38,6 @@ import org.carrot2.language.LanguageComponents;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.lang.invoke.MethodHandles;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.stream.Collectors;
-
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
*
@@ -48,9 +47,7 @@ import java.util.stream.Collectors;
final class Engine {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- /**
- * All resources required for the clustering engine.
- */
+ /** All resources required for the clustering engine. */
private EngineContext engineContext;
boolean init(String engineName, SolrCore core, EngineParameters
defaultParams) {
@@ -59,24 +56,30 @@ final class Engine {
this.engineContext = new EngineContext(defaultParams.resources(), core);
{
- ClusteringAlgorithm defaultAlgorithm =
engineContext.getAlgorithm(defaultParams.algorithmName());
+ ClusteringAlgorithm defaultAlgorithm =
+ engineContext.getAlgorithm(defaultParams.algorithmName());
LanguageComponents defaultLanguage =
engineContext.getLanguage(defaultParams.language());
if (defaultAlgorithm == null) {
- log.warn("The default clustering algorithm for engine '{}' is not
available: {}",
- engineName, defaultParams.algorithmName());
+ log.warn(
+ "The default clustering algorithm for engine '{}' is not
available: {}",
+ engineName,
+ defaultParams.algorithmName());
}
if (defaultLanguage == null) {
- log.warn("The default language for engine {} is not available: {}",
- engineName, defaultParams.language());
+ log.warn(
+ "The default language for engine {} is not available: {}",
+ engineName,
+ defaultParams.language());
}
return (defaultAlgorithm != null && defaultLanguage != null);
}
}
- List<Cluster<InputDocument>> cluster(EngineParameters parameters, Query
query, List<InputDocument> documents) {
+ List<Cluster<InputDocument>> cluster(
+ EngineParameters parameters, Query query, List<InputDocument> documents)
{
try {
checkParameters(parameters);
@@ -107,7 +110,8 @@ final class Engine {
if (warnOnce.add(lang)) {
log.warn(
"Language '{}' is not supported, documents in this "
- + "language will not be clustered.", lang);
+ + "language will not be clustered.",
+ lang);
}
} else {
LanguageComponents langComponents = engineContext.getLanguage(lang);
@@ -115,11 +119,12 @@ final class Engine {
if (warnOnce.add(lang)) {
log.warn(
"Language '{}' is not supported by algorithm '{}', documents
in this "
- + "language will not be clustered.", lang,
parameters.algorithmName());
+ + "language will not be clustered.",
+ lang,
+ parameters.algorithmName());
}
} else {
- clustersByLanguage.put(
- lang, algorithm.cluster(e.getValue().stream(),
langComponents));
+ clustersByLanguage.put(lang,
algorithm.cluster(e.getValue().stream(), langComponents));
}
}
}
@@ -128,14 +133,16 @@ final class Engine {
if (clustersByLanguage.size() == 1) {
clusters = clustersByLanguage.values().iterator().next();
} else {
- clusters = clustersByLanguage.entrySet().stream()
- .map(e -> {
- Cluster<InputDocument> cluster = new Cluster<>();
- cluster.addLabel(e.getKey());
- e.getValue().forEach(cluster::addCluster);
- return cluster;
- })
- .collect(Collectors.toList());
+ clusters =
+ clustersByLanguage.entrySet().stream()
+ .map(
+ e -> {
+ Cluster<InputDocument> cluster = new Cluster<>();
+ cluster.addLabel(e.getKey());
+ e.getValue().forEach(cluster::addCluster);
+ return cluster;
+ })
+ .collect(Collectors.toList());
}
return clusters;
@@ -145,19 +152,21 @@ final class Engine {
}
}
- private void populateAlgorithmParameters(Query query, EngineParameters
requestParameters, ClusteringAlgorithm algorithm) {
+ private void populateAlgorithmParameters(
+ Query query, EngineParameters requestParameters, ClusteringAlgorithm
algorithm) {
LinkedHashMap<String, String> attrs = requestParameters.otherParameters();
// Set the optional query hint. We extract just the terms
if (!attrs.containsKey("queryHint")) {
Set<String> termSet = new LinkedHashSet<>();
- query.visit(new QueryVisitor() {
- @Override
- public void consumeTerms(Query query, Term... terms) {
- for (Term t : terms) {
- termSet.add(t.text());
- }
- }
- });
+ query.visit(
+ new QueryVisitor() {
+ @Override
+ public void consumeTerms(Query query, Term... terms) {
+ for (Term t : terms) {
+ termSet.add(t.text());
+ }
+ }
+ });
attrs.put("queryHint", String.join(" ", termSet));
}
algorithm.accept(new FlatKeysAttrVisitor(attrs));
@@ -166,30 +175,36 @@ final class Engine {
private void checkParameters(EngineParameters parameters) {
ClusteringAlgorithm algorithm =
engineContext.getAlgorithm(parameters.algorithmName());
if (algorithm == null) {
- throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
- "Algorithm '%s' not found.",
- parameters.algorithmName()));
+ throw new SolrException(
+ ErrorCode.BAD_REQUEST,
+ String.format(Locale.ROOT, "Algorithm '%s' not found.",
parameters.algorithmName()));
}
String defaultLanguage = parameters.language();
LanguageComponents languageComponents =
engineContext.getLanguage(defaultLanguage);
if (languageComponents == null) {
- throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
- "Language '%s' is not supported.",
- defaultLanguage));
+ throw new SolrException(
+ ErrorCode.BAD_REQUEST,
+ String.format(Locale.ROOT, "Language '%s' is not supported.",
defaultLanguage));
}
if (!algorithm.supports(languageComponents)) {
- throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
- "Language '%s' is not supported by algorithm '%s'.",
- defaultLanguage,
- parameters.algorithmName()));
+ throw new SolrException(
+ ErrorCode.BAD_REQUEST,
+ String.format(
+ Locale.ROOT,
+ "Language '%s' is not supported by algorithm '%s'.",
+ defaultLanguage,
+ parameters.algorithmName()));
}
if (parameters.fields().isEmpty()) {
- throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
- "At least one field name specifying content for clustering is
required in parameter '%s'.",
- EngineParameters.PARAM_FIELDS));
+ throw new SolrException(
+ ErrorCode.BAD_REQUEST,
+ String.format(
+ Locale.ROOT,
+ "At least one field name specifying content for clustering is
required in parameter '%s'.",
+ EngineParameters.PARAM_FIELDS));
}
}
}
diff --git a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineContext.java b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineContext.java
index 4ad7792..c2789da 100644
--- a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineContext.java
+++ b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineContext.java
@@ -16,19 +16,6 @@
*/
package org.apache.solr.handler.clustering;
-import org.apache.solr.core.SolrCore;
-import org.carrot2.clustering.ClusteringAlgorithm;
-import org.carrot2.clustering.ClusteringAlgorithmProvider;
-import org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm;
-import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
-import org.carrot2.clustering.stc.STCClusteringAlgorithm;
-import org.carrot2.language.LanguageComponents;
-import org.carrot2.language.LanguageComponentsLoader;
-import org.carrot2.language.LoadedLanguages;
-import org.carrot2.util.ChainedResourceLookup;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import java.io.IOException;
import java.io.UncheckedIOException;
import java.lang.invoke.MethodHandles;
@@ -44,24 +31,35 @@ import java.util.Optional;
import java.util.ServiceLoader;
import java.util.function.Supplier;
import java.util.stream.Collectors;
+import org.apache.solr.core.SolrCore;
+import org.carrot2.clustering.ClusteringAlgorithm;
+import org.carrot2.clustering.ClusteringAlgorithmProvider;
+import org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm;
+import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
+import org.carrot2.clustering.stc.STCClusteringAlgorithm;
+import org.carrot2.language.LanguageComponents;
+import org.carrot2.language.LanguageComponentsLoader;
+import org.carrot2.language.LoadedLanguages;
+import org.carrot2.util.ChainedResourceLookup;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-/**
- * Clustering engine context: algorithms, preloaded language
- * resources and initial validation.
- */
+/** Clustering engine context: algorithms, preloaded language resources and
initial validation. */
final class EngineContext {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final LinkedHashMap<String, LanguageComponents> languages;
private final Map<String, ClusteringAlgorithmProvider> algorithmProviders;
- private final static Map<String, String> aliasedNames;
+ private static final Map<String, String> aliasedNames;
static {
aliasedNames = new HashMap<>();
aliasedNames.put(LingoClusteringAlgorithm.class.getName(),
LingoClusteringAlgorithm.NAME);
aliasedNames.put(STCClusteringAlgorithm.class.getName(),
STCClusteringAlgorithm.NAME);
- aliasedNames.put(BisectingKMeansClusteringAlgorithm.class.getName(),
BisectingKMeansClusteringAlgorithm.NAME);
+ aliasedNames.put(
+ BisectingKMeansClusteringAlgorithm.class.getName(),
+ BisectingKMeansClusteringAlgorithm.NAME);
}
EngineContext(String resourcesPath, SolrCore core) {
@@ -76,8 +74,7 @@ final class EngineContext {
}
if (!resourceLocations.isEmpty()) {
- log.info(
- "Clustering algorithm resources first looked up relative to: {}",
resourceLocations);
+ log.info("Clustering algorithm resources first looked up relative to:
{}", resourceLocations);
loader.withResourceLookup(
(provider) ->
@@ -91,8 +88,7 @@ final class EngineContext {
ClassLoader classLoader = getClass().getClassLoader();
algorithmProviders =
- ServiceLoader.load(ClusteringAlgorithmProvider.class, classLoader)
- .stream()
+ ServiceLoader.load(ClusteringAlgorithmProvider.class,
classLoader).stream()
.map(ServiceLoader.Provider::get)
.collect(Collectors.toMap(ClusteringAlgorithmProvider::name, e ->
e));
@@ -145,8 +141,7 @@ final class EngineContext {
}
ClusteringAlgorithm getAlgorithm(String algorithmName) {
- if (!algorithmProviders.containsKey(algorithmName)
- && aliasedNames.containsKey(algorithmName)) {
+ if (!algorithmProviders.containsKey(algorithmName) &&
aliasedNames.containsKey(algorithmName)) {
algorithmName = aliasedNames.get(algorithmName);
}
@@ -173,4 +168,4 @@ final class EngineContext {
return true;
}
}
-}
\ No newline at end of file
+}
diff --git a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineEntry.java b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineEntry.java
index 6a3191e..29d1f82 100644
--- a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineEntry.java
+++ b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineEntry.java
@@ -16,40 +16,28 @@
*/
package org.apache.solr.handler.clustering;
+import java.util.function.Supplier;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
-import java.util.function.Supplier;
-
-/**
- * Parses each clustering engine configuration
- * initialization parameters.
- */
+/** Parses each clustering engine configuration initialization parameters. */
final class EngineEntry implements Supplier<Engine> {
- /**
- * Marks the engine as optional (if unavailable).
- */
+ /** Marks the engine as optional (if unavailable). */
private static final String PARAM_OPTIONAL = "optional";
- /**
- * Unique engine name parameter.
- */
+ /** Unique engine name parameter. */
private static final String PARAM_NAME = "name";
final boolean optional;
final String engineName;
final EngineParameters defaults;
- /**
- * Preinitialized instance of a clustering engine.
- */
+ /** Preinitialized instance of a clustering engine. */
private Engine engine;
- /**
- * {@code true} if the engine has been initialized properly and is available.
- */
+ /** {@code true} if the engine has been initialized properly and is
available. */
private boolean available;
EngineEntry(SolrParams params) {
@@ -62,8 +50,10 @@ final class EngineEntry implements Supplier<Engine> {
boolean initialize(SolrCore core) {
SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
if (uniqueField == null) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
- ClusteringComponent.class.getSimpleName() + " requires the declaration of uniqueKeyField in the schema.");
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ ClusteringComponent.class.getSimpleName()
+ + " requires the declaration of uniqueKeyField in the schema.");
}
String docIdField = uniqueField.getName();
defaults.setDocIdField(docIdField);
diff --git
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineParameters.java
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineParameters.java
index 5169343..18d72b5 100644
---
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineParameters.java
+++
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/EngineParameters.java
@@ -16,144 +16,95 @@
*/
package org.apache.solr.handler.clustering;
-import org.apache.solr.common.params.SolrParams;
-
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
+import org.apache.solr.common.params.SolrParams;
/**
- * {@link Engine} configuration parameters (and other parameters that
- * may tweak clustering algorithms on a per-request basis).
+ * {@link Engine} configuration parameters (and other parameters that may
tweak clustering
+ * algorithms on a per-request basis).
*
* @lucene.experimental
*/
public final class EngineParameters implements Cloneable {
- /**
- * Common prefix for configuration of engine settings.
- */
+ /** Common prefix for configuration of engine settings. */
private static final String PARAM_PREFIX = "clustering.";
- /**
- * @see #algorithmName()
- */
+ /** @see #algorithmName() */
public static final String PARAM_ALGORITHM = PARAM_PREFIX + "algorithm";
- /**
- * @see #maxLabels()
- */
+ /** @see #maxLabels() */
public static final String PARAM_MAX_LABELS = PARAM_PREFIX + "maxLabels";
- /**
- * @see #includeSubclusters()
- */
+ /** @see #includeSubclusters() */
public static final String PARAM_INCLUDE_SUBCLUSTERS = PARAM_PREFIX +
"includeSubclusters";
- /**
- * @see #includeOtherTopics()
- */
+ /** @see #includeOtherTopics() */
public static final String PARAM_INCLUDE_OTHER_TOPICS = PARAM_PREFIX +
"includeOtherTopics";
- /**
- * @see #language()
- */
+ /** @see #language() */
public static final String PARAM_LANGUAGE = PARAM_PREFIX + "language";
- /**
- * @see #languageField()
- */
+ /** @see #languageField() */
public static final String PARAM_LANGUAGE_FIELD = PARAM_PREFIX +
"languageField";
- /**
- * @see #resources()
- */
+ /** @see #resources() */
public static final String PARAM_RESOURCES = PARAM_PREFIX + "resources";
- /**
- * @see #fields()
- */
+ /** @see #fields() */
public static final String PARAM_FIELDS = PARAM_PREFIX + "fields";
- /**
- * @see #preferQueryContext()
- */
+ /** @see #preferQueryContext() */
public static final String PARAM_PREFER_QUERY_CONTEXT = PARAM_PREFIX +
"preferQueryContext";
- /**
- * @see #contextSize()
- */
+ /** @see #contextSize() */
public static final String PARAM_CONTEXT_SIZE = PARAM_PREFIX + "contextSize";
- /**
- * @see #contextCount()
- */
+ /** @see #contextCount() */
public static final String PARAM_CONTEXT_COUNT = PARAM_PREFIX +
"contextCount";
- /**
- * @see #PARAM_MAX_LABELS
- */
+ /** @see #PARAM_MAX_LABELS */
private int maxLabels = Integer.MAX_VALUE;
- /**
- * @see #PARAM_INCLUDE_SUBCLUSTERS
- */
+ /** @see #PARAM_INCLUDE_SUBCLUSTERS */
private boolean includeSubclusters = true;
- /**
- * @see #PARAM_INCLUDE_OTHER_TOPICS
- */
+ /** @see #PARAM_INCLUDE_OTHER_TOPICS */
private boolean includeOtherTopics = true;
- /**
- * @see #PARAM_ALGORITHM
- */
+ /** @see #PARAM_ALGORITHM */
private String algorithmName;
- /**
- * @see #PARAM_RESOURCES
- */
+ /** @see #PARAM_RESOURCES */
private String resources;
- /**
- * @see #PARAM_LANGUAGE
- */
+ /** @see #PARAM_LANGUAGE */
private String language = "English";
- /**
- * @see #PARAM_LANGUAGE_FIELD
- */
+ /** @see #PARAM_LANGUAGE_FIELD */
private String languageField;
- /**
- * @see #PARAM_PREFER_QUERY_CONTEXT
- */
+ /** @see #PARAM_PREFER_QUERY_CONTEXT */
private boolean preferQueryContext;
- /**
- * @see #PARAM_CONTEXT_SIZE
- */
+ /** @see #PARAM_CONTEXT_SIZE */
private int contextSize = 80 * 4;
- /**
- * @see #PARAM_CONTEXT_COUNT
- */
+ /** @see #PARAM_CONTEXT_COUNT */
private int contextCount = 3;
- /**
- * @see #PARAM_FIELDS
- */
+ /** @see #PARAM_FIELDS */
private LinkedHashSet<String> fields = new LinkedHashSet<>();
- /**
- * Non-engine configuration parameters (algorithm parameters).
- */
+ /** Non-engine configuration parameters (algorithm parameters). */
private LinkedHashMap<String, String> otherParameters = new
LinkedHashMap<>();
/**
- * Unique-value document identifier field. This is required for clustering
since clusters
- * only reference documents by their ID field's value.
+ * Unique-value document identifier field. This is required for clustering
since clusters only
+ * reference documents by their ID field's value.
*/
private String docIdField;
@@ -161,114 +112,109 @@ public final class EngineParameters implements
Cloneable {
extractFrom(params);
}
- /**
- * Extract parameter values from the given {@link SolrParams}.
- */
+ /** Extract parameter values from the given {@link SolrParams}. */
private EngineParameters extractFrom(SolrParams params) {
- params.stream().forEachOrdered(e -> {
- switch (e.getKey()) {
- case PARAM_MAX_LABELS:
- maxLabels = params.getInt(PARAM_MAX_LABELS);
- break;
- case PARAM_INCLUDE_SUBCLUSTERS:
- includeSubclusters = params.getBool(PARAM_INCLUDE_SUBCLUSTERS);
- break;
- case PARAM_INCLUDE_OTHER_TOPICS:
- includeOtherTopics = params.getBool(PARAM_INCLUDE_OTHER_TOPICS);
- break;
- case PARAM_ALGORITHM:
- algorithmName = params.get(PARAM_ALGORITHM);
- break;
- case PARAM_RESOURCES:
- resources = params.get(PARAM_RESOURCES);
- break;
- case PARAM_LANGUAGE:
- language = params.get(PARAM_LANGUAGE);
- break;
- case PARAM_LANGUAGE_FIELD:
- languageField = params.get(PARAM_LANGUAGE_FIELD);
- break;
- case PARAM_PREFER_QUERY_CONTEXT:
- preferQueryContext = params.getBool(PARAM_PREFER_QUERY_CONTEXT);
- break;
- case PARAM_CONTEXT_COUNT:
- contextCount = params.getPrimitiveInt(PARAM_CONTEXT_COUNT);
- break;
- case PARAM_CONTEXT_SIZE:
- contextSize = params.getPrimitiveInt(PARAM_CONTEXT_SIZE);
- break;
- case PARAM_FIELDS:
-
fields.addAll(Arrays.asList(params.get(PARAM_FIELDS).split("[,]\\s*")));
- break;
- default:
- // Unrecognized parameter. Preserve it.
- String[] value = e.getValue();
- if (value != null) {
- if (value.length == 1) {
- otherParameters.put(e.getKey(), value[0]);
- } else {
- otherParameters.put(e.getKey(), String.join(", ", value));
- }
- }
- break;
- }
- });
+ params.stream()
+ .forEachOrdered(
+ e -> {
+ switch (e.getKey()) {
+ case PARAM_MAX_LABELS:
+ maxLabels = params.getInt(PARAM_MAX_LABELS);
+ break;
+ case PARAM_INCLUDE_SUBCLUSTERS:
+ includeSubclusters =
params.getBool(PARAM_INCLUDE_SUBCLUSTERS);
+ break;
+ case PARAM_INCLUDE_OTHER_TOPICS:
+ includeOtherTopics =
params.getBool(PARAM_INCLUDE_OTHER_TOPICS);
+ break;
+ case PARAM_ALGORITHM:
+ algorithmName = params.get(PARAM_ALGORITHM);
+ break;
+ case PARAM_RESOURCES:
+ resources = params.get(PARAM_RESOURCES);
+ break;
+ case PARAM_LANGUAGE:
+ language = params.get(PARAM_LANGUAGE);
+ break;
+ case PARAM_LANGUAGE_FIELD:
+ languageField = params.get(PARAM_LANGUAGE_FIELD);
+ break;
+ case PARAM_PREFER_QUERY_CONTEXT:
+ preferQueryContext =
params.getBool(PARAM_PREFER_QUERY_CONTEXT);
+ break;
+ case PARAM_CONTEXT_COUNT:
+ contextCount = params.getPrimitiveInt(PARAM_CONTEXT_COUNT);
+ break;
+ case PARAM_CONTEXT_SIZE:
+ contextSize = params.getPrimitiveInt(PARAM_CONTEXT_SIZE);
+ break;
+ case PARAM_FIELDS:
+
fields.addAll(Arrays.asList(params.get(PARAM_FIELDS).split("[,]\\s*")));
+ break;
+ default:
+ // Unrecognized parameter. Preserve it.
+ String[] value = e.getValue();
+ if (value != null) {
+ if (value.length == 1) {
+ otherParameters.put(e.getKey(), value[0]);
+ } else {
+ otherParameters.put(e.getKey(), String.join(", ",
value));
+ }
+ }
+ break;
+ }
+ });
return this;
}
- /**
- * @return Maximum number of returned cluster labels (even if the algorithm
- * returns more).
- */
+ /** @return Maximum number of returned cluster labels (even if the algorithm
returns more). */
int maxLabels() {
return maxLabels;
}
/**
- * @return If {@code true}, include subclusters in response (if the algorithm
- * produces hierarchical clustering).
+ * @return If {@code true}, include subclusters in response (if the
algorithm produces
+ * hierarchical clustering).
*/
boolean includeSubclusters() {
return includeSubclusters;
}
/**
- * @return If {@code true}, include a synthetic cluster called "Other
Topics" that
- * consists of all documents not assigned to any other cluster.
+ * @return If {@code true}, include a synthetic cluster called "Other
Topics" that consists of all
+ * documents not assigned to any other cluster.
*/
boolean includeOtherTopics() {
return includeOtherTopics;
}
/**
- * @return Name of the clustering algorithm to use (as loaded via the service
- * * extension point {@link org.carrot2.clustering.ClusteringAlgorithm}).
+ * @return Name of the clustering algorithm to use (as loaded via the
service * extension point
+ * {@link org.carrot2.clustering.ClusteringAlgorithm}).
*/
String algorithmName() {
return algorithmName;
}
- /**
- * @return Return Solr component-configuration relative language resources
path.
- */
+ /** @return Return Solr component-configuration relative language resources
path. */
String resources() {
return resources;
}
/**
- * @return Name of the default language to use for clustering. The
corresponding
- * {@link org.carrot2.language.LanguageComponents} must be available (loaded
via
- * service provider extension).
+ * @return Name of the default language to use for clustering. The
corresponding {@link
+ * org.carrot2.language.LanguageComponents} must be available (loaded
via service provider
+ * extension).
*/
String language() {
return language;
}
/**
- * @return Name of the field that carries each document's language. {@code
null} value
- * means all documents will be clustered according to the default {@link
#language()}.
- * If not {@code null} and the document's field has a missing value, it will
be clustered
- * using the default {@link #language()} as well.
+ * @return Name of the field that carries each document's language. {@code
null} value means all
+ * documents will be clustered according to the default {@link
#language()}. If not {@code
+ * null} and the document's field has a missing value, it will be
clustered using the default
+ * {@link #language()} as well.
*/
String languageField() {
return languageField;
@@ -276,31 +222,32 @@ public final class EngineParameters implements Cloneable {
/**
* @return Names of all fields whose textual content will be passed to the
clustering engine.
- * Comma or space separated.
+ * Comma or space separated.
*/
Set<String> fields() {
return fields;
}
/**
- * @return Returns {@code true} if clustering should try to extract context
fragments
- * around the matching query regions rather than use full field content.
Such context snippets
- * typically cluster well because they carry a more compact and
query-related information.
+ * @return Returns {@code true} if clustering should try to extract context
fragments around the
+ * matching query regions rather than use full field content. Such
context snippets typically
+ * cluster well because they carry a more compact and query-related
information.
*/
boolean preferQueryContext() {
return preferQueryContext;
}
/**
- * @return Returns the maximum query context window to use if {@link
#preferQueryContext()} is {@code true}.
+ * @return Returns the maximum query context window to use if {@link
#preferQueryContext()} is
+ * {@code true}.
*/
int contextSize() {
return contextSize;
}
/**
- * @return Returns the maximum number of different, non-contiguous query
context snippets from a single field
- * if {@link #preferQueryContext()} is {@code true}.
+ * @return Returns the maximum number of different, non-contiguous query
context snippets from a
+ * single field if {@link #preferQueryContext()} is {@code true}.
*/
int contextCount() {
return contextCount;
@@ -323,8 +270,8 @@ public final class EngineParameters implements Cloneable {
}
/**
- * @return Return a copy of the argument with any parameters present in
- * {@code params} overriding this object defaults.
+ * @return Return a copy of the argument with any parameters present in
{@code params} overriding
+ * this object defaults.
*/
EngineParameters derivedFrom(SolrParams params) {
EngineParameters cloned = this.clone();
diff --git
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/FlatKeysAttrVisitor.java
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/FlatKeysAttrVisitor.java
index 25db082..31f73b5 100644
---
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/FlatKeysAttrVisitor.java
+++
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/FlatKeysAttrVisitor.java
@@ -16,6 +16,15 @@
*/
package org.apache.solr.handler.clustering;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.LinkedHashMap;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.function.Function;
import org.carrot2.attrs.AcceptingVisitor;
import org.carrot2.attrs.AliasMapper;
import org.carrot2.attrs.AttrBoolean;
@@ -28,19 +37,9 @@ import org.carrot2.attrs.AttrString;
import org.carrot2.attrs.AttrStringArray;
import org.carrot2.attrs.AttrVisitor;
-import java.util.ArrayDeque;
-import java.util.Arrays;
-import java.util.EnumSet;
-import java.util.LinkedHashMap;
-import java.util.Locale;
-import java.util.Objects;
-import java.util.function.BiConsumer;
-import java.util.function.Consumer;
-import java.util.function.Function;
-
/**
- * {@link AttrVisitor} that responds to "flattened" key paths and values,
updating
- * corresponding algorithm parameters with values contained in the map.
+ * {@link AttrVisitor} that responds to "flattened" key paths and values,
updating corresponding
+ * algorithm parameters with values contained in the map.
*/
class FlatKeysAttrVisitor implements AttrVisitor {
final Function<String, Object> classToInstance =
AliasMapper.SPI_DEFAULTS::fromName;
@@ -49,9 +48,8 @@ class FlatKeysAttrVisitor implements AttrVisitor {
final LinkedHashMap<String, String> attrs;
/**
- * @param attrs A map of attributes to set. Note the map has ordered keys:
- * this is required for complex sub-types so that instantiation
of
- * a value precedes setting its attributes.
+ * @param attrs A map of attributes to set. Note the map has ordered keys:
this is required for
+ * complex sub-types so that instantiation of a value precedes setting
its attributes.
*/
FlatKeysAttrVisitor(LinkedHashMap<String, String> attrs) {
this.attrs = attrs;
@@ -59,87 +57,109 @@ class FlatKeysAttrVisitor implements AttrVisitor {
@Override
public void visit(String key, AttrBoolean attr) {
- ifKeyExists(key, (path, value) -> {
- attr.set(value == null ? null : Boolean.parseBoolean(value));
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ attr.set(value == null ? null : Boolean.parseBoolean(value));
+ });
}
@Override
public void visit(String key, AttrInteger attr) {
- ifKeyExists(key, (path, value) -> {
- attr.set(value == null ? null : Integer.parseInt(value));
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ attr.set(value == null ? null : Integer.parseInt(value));
+ });
}
@Override
public void visit(String key, AttrDouble attr) {
- ifKeyExists(key, (path, value) -> {
- attr.set(value == null ? null : Double.parseDouble(value));
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ attr.set(value == null ? null : Double.parseDouble(value));
+ });
}
@Override
public void visit(String key, AttrString attr) {
- ifKeyExists(key, (path, value) -> {
- attr.set(value);
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ attr.set(value);
+ });
}
@Override
public void visit(String key, AttrStringArray attr) {
- ifKeyExists(key, (path, value) -> {
- if (value == null) {
- attr.set(new String[0]);
- } else {
- attr.set(value.split(",\\s*"));
- }
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ if (value == null) {
+ attr.set(new String[0]);
+ } else {
+ attr.set(value.split(",\\s*"));
+ }
+ });
}
@Override
public <T extends Enum<T>> void visit(String key, AttrEnum<T> attr) {
- ifKeyExists(key, (path, value) -> {
- try {
- attr.set(Enum.valueOf(attr.enumClass(), value));
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException(
- String.format(
- Locale.ROOT,
- "Value at key '%s' should be an enum constant of class '%s', but no such " +
- "constant exists: '%s' (available constants: %s)",
- key,
- attr.enumClass().getSimpleName(),
- toDebugString(value),
- EnumSet.allOf(attr.enumClass())));
- }
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ try {
+ attr.set(Enum.valueOf(attr.enumClass(), value));
+ } catch (IllegalArgumentException e) {
+ throw new IllegalArgumentException(
+ String.format(
+ Locale.ROOT,
+ "Value at key '%s' should be an enum constant of class '%s', but no such "
+ + "constant exists: '%s' (available constants: %s)",
+ key,
+ attr.enumClass().getSimpleName(),
+ toDebugString(value),
+ EnumSet.allOf(attr.enumClass())));
+ }
+ });
}
@Override
public <T extends AcceptingVisitor> void visit(String key, AttrObject<T>
attr) {
- ifKeyExists(key, (path, value) -> {
- if (value == null) {
- attr.set(null);
- } else {
- T t = safeCast(classToInstance.apply(value), key,
attr.getInterfaceClass());
- attr.set(t);
- }
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ if (value == null) {
+ attr.set(null);
+ } else {
+ T t = safeCast(classToInstance.apply(value), key,
attr.getInterfaceClass());
+ attr.set(t);
+ }
+ });
T t = attr.get();
if (t != null) {
- withKey(key, path -> {
- t.accept(this);
- });
+ withKey(
+ key,
+ path -> {
+ t.accept(this);
+ });
}
}
@Override
public <T extends AcceptingVisitor> void visit(String key,
AttrObjectArray<T> attr) {
- ifKeyExists(key, (path, value) -> {
- throw new RuntimeException("Setting arrays of objects not implemented for attribute: "
- + key + " (" + attr.getDescription() + ")");
- });
+ ifKeyExists(
+ key,
+ (path, value) -> {
+ throw new RuntimeException(
+ "Setting arrays of objects not implemented for attribute: "
+ + key
+ + " ("
+ + attr.getDescription()
+ + ")");
+ });
}
private <T> T safeCast(Object value, String key, Class<T> clazz) {
@@ -181,14 +201,16 @@ class FlatKeysAttrVisitor implements AttrVisitor {
}
private void ifKeyExists(String key, BiConsumer<String, String>
pathConsumer) {
- withKey(key, (path) -> {
- if (attrs.containsKey(path)) {
- String value = attrs.get(path);
- if (value.trim().isEmpty()) {
- value = null;
- }
- pathConsumer.accept(path, value);
- }
- });
+ withKey(
+ key,
+ (path) -> {
+ if (attrs.containsKey(path)) {
+ String value = attrs.get(path);
+ if (value.trim().isEmpty()) {
+ value = null;
+ }
+ pathConsumer.accept(path, value);
+ }
+ });
}
}
diff --git
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/InputDocument.java
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/InputDocument.java
index a45eda0..42b9ad5 100644
---
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/InputDocument.java
+++
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/InputDocument.java
@@ -16,18 +16,15 @@
*/
package org.apache.solr.handler.clustering;
-import org.carrot2.clustering.Document;
-
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
+import org.carrot2.clustering.Document;
-/**
- * Representation of a single logical "document" for clustering.
- */
+/** Representation of a single logical "document" for clustering. */
final class InputDocument implements Document {
private final Object id;
private final Map<String, String> clusteredFields = new LinkedHashMap<>();
@@ -58,10 +55,13 @@ final class InputDocument implements Document {
@Override
public String toString() {
- return String.format(Locale.ROOT,
+ return String.format(
+ Locale.ROOT,
"doc[%s, lang=%s, fields=%s]",
getId(),
language,
- clusteredFields.entrySet().stream().map(e -> e.getKey() + ": " +
e.getValue()).collect(Collectors.joining(", ")));
+ clusteredFields.entrySet().stream()
+ .map(e -> e.getKey() + ": " + e.getValue())
+ .collect(Collectors.joining(", ")));
}
}
diff --git
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/PathResourceLookup.java
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/PathResourceLookup.java
index 2c07e83..fe0d9c2 100644
---
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/PathResourceLookup.java
+++
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/PathResourceLookup.java
@@ -16,8 +16,6 @@
*/
package org.apache.solr.handler.clustering;
-import org.carrot2.util.ResourceLookup;
-
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -25,10 +23,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
+import org.carrot2.util.ResourceLookup;
-/**
- * Carrot2 resource provider from the provided list of filesystem paths.
- */
+/** Carrot2 resource provider from the provided list of filesystem paths. */
final class PathResourceLookup implements ResourceLookup {
private final List<Path> locations;
diff --git
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/package-info.java
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/package-info.java
index 565001c..ce5a2ae 100644
---
a/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/package-info.java
+++
b/solr/modules/clustering/src/java/org/apache/solr/handler/clustering/package-info.java
@@ -16,17 +16,11 @@
*/
/**
- * A {@link org.apache.solr.handler.component.SearchComponent} for dynamic,
- * unsupervised grouping of
- * search results based on the content of their text fields or contextual
- * snippets around query-matching regions.
+ * A {@link org.apache.solr.handler.component.SearchComponent} for dynamic,
unsupervised grouping of
+ * search results based on the content of their text fields or contextual
snippets around
+ * query-matching regions.
*
- * <p>
- * The default implementation uses clustering algorithms from the
- * <a href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
+ * <p>The default implementation uses clustering algorithms from the <a
+ * href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
*/
package org.apache.solr.handler.clustering;
-
-
-
-
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentDistributedTest.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentDistributedTest.java
index bbd4fba..1a9398a 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentDistributedTest.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentDistributedTest.java
@@ -16,6 +16,10 @@
*/
package org.apache.solr.handler.clustering;
+import java.io.IOException;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.response.Cluster;
@@ -27,14 +31,9 @@ import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
-import java.io.IOException;
-import java.util.List;
-import java.util.function.Consumer;
-import java.util.stream.Collectors;
-
@SuppressSSL
public class ClusteringComponentDistributedTest extends
BaseDistributedSearchTestCase {
- private final static String QUERY_TESTSET_SAMPLE_DOCUMENTS =
"testSet:sampleDocs";
+ private static final String QUERY_TESTSET_SAMPLE_DOCUMENTS =
"testSet:sampleDocs";
@Override
public String getSolrHome() {
@@ -46,10 +45,7 @@ public class ClusteringComponentDistributedTest extends
BaseDistributedSearchTes
del("*:*");
String[] languages = {
- "English",
- "French",
- "German",
- "Unknown",
+ "English", "French", "German", "Unknown",
};
int docId = 0;
@@ -59,8 +55,7 @@ public class ClusteringComponentDistributedTest extends
BaseDistributedSearchTes
"title", doc[0],
"snippet", doc[1],
"testSet", "sampleDocs",
- "lang", languages[docId % languages.length]
- );
+ "lang", languages[docId % languages.length]);
docId++;
}
commit();
@@ -69,17 +64,23 @@ public class ClusteringComponentDistributedTest extends
BaseDistributedSearchTes
@Test
@ShardsFixed(num = 2)
public void testLingoAlgorithm() throws Exception {
- compareToExpected(clusters(QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
- params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "lingo");
- }));
+ compareToExpected(
+ clusters(
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "lingo");
+ }));
}
@Test
@ShardsFixed(num = 2)
public void testStcAlgorithm() throws Exception {
- compareToExpected(clusters(QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
- params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "stc");
- }));
+ compareToExpected(
+ clusters(
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "stc");
+ }));
}
private void compareToExpected(List<Cluster> actual) throws IOException {
@@ -88,7 +89,8 @@ public class ClusteringComponentDistributedTest extends
BaseDistributedSearchTes
ClusteringComponentTest.compareWhitespaceNormalized(toString(actual),
expected);
}
- private List<Cluster> clusters(String query, Consumer<ModifiableSolrParams>
paramsConsumer) throws Exception {
+ private List<Cluster> clusters(String query, Consumer<ModifiableSolrParams>
paramsConsumer)
+ throws Exception {
handle.clear();
handle.put("responseHeader", SKIP);
handle.put("response", SKIP);
@@ -113,18 +115,19 @@ public class ClusteringComponentDistributedTest extends
BaseDistributedSearchTes
}
private StringBuilder toString(List<Cluster> clusters, String indent,
StringBuilder sb) {
- clusters.forEach(c -> {
- sb.append(indent);
- sb.append("- " + c.getLabels().stream().collect(Collectors.joining("; ")));
- if (!c.getDocs().isEmpty()) {
- sb.append(" [" + c.getDocs().size() + "]");
- }
- sb.append("\n");
-
- if (!c.getClusters().isEmpty()) {
- toString(c.getClusters(), indent + " ", sb);
- }
- });
+ clusters.forEach(
+ c -> {
+ sb.append(indent);
+ sb.append("- " +
c.getLabels().stream().collect(Collectors.joining("; ")));
+ if (!c.getDocs().isEmpty()) {
+ sb.append(" [" + c.getDocs().size() + "]");
+ }
+ sb.append("\n");
+
+ if (!c.getClusters().isEmpty()) {
+ toString(c.getClusters(), indent + " ", sb);
+ }
+ });
return sb;
}
}
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
index d77f830..0c6c968 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
@@ -17,6 +17,19 @@
package org.apache.solr.handler.clustering;
import com.carrotsearch.randomizedtesting.RandomizedContext;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.response.ClusteringResponse;
@@ -38,25 +51,9 @@ import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Objects;
-import java.util.function.Consumer;
-import java.util.function.Function;
-import java.util.stream.Collectors;
-
-/**
- * Tests {@link Engine}.
- */
+/** Tests {@link Engine}. */
public class ClusteringComponentTest extends SolrTestCaseJ4 {
- private final static String QUERY_TESTSET_SAMPLE_DOCUMENTS =
"testSet:sampleDocs";
+ private static final String QUERY_TESTSET_SAMPLE_DOCUMENTS =
"testSet:sampleDocs";
@BeforeClass
public static void beforeClass() throws Exception {
@@ -65,20 +62,19 @@ public class ClusteringComponentTest extends SolrTestCaseJ4
{
initCore("solrconfig.xml", "schema.xml", testHome.getAbsolutePath());
String[] languages = {
- "English",
- "French",
- "German",
- "Unknown",
+ "English", "French", "German", "Unknown",
};
int docId = 0;
for (String[] doc : SampleData.SAMPLE_DOCUMENTS) {
- assertNull(h.validateUpdate(adoc(
- "id", Integer.toString(docId),
- "title", doc[0],
- "snippet", doc[1],
- "testSet", "sampleDocs",
- "lang", languages[docId % languages.length])));
+ assertNull(
+ h.validateUpdate(
+ adoc(
+ "id", Integer.toString(docId),
+ "title", doc[0],
+ "snippet", doc[1],
+ "testSet", "sampleDocs",
+ "lang", languages[docId % languages.length])));
docId++;
}
@@ -102,44 +98,67 @@ public class ClusteringComponentTest extends
SolrTestCaseJ4 {
@Test
public void testParamSubclusters() throws Exception {
- compareToExpected("off", clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS,
params -> {
- params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, false);
- }));
- compareToExpected("on", clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS,
params -> {
- params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, true);
- }));
+ compareToExpected(
+ "off",
+ clusters(
+ "mock",
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, false);
+ }));
+ compareToExpected(
+ "on",
+ clusters(
+ "mock",
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, true);
+ }));
}
@Test
public void testParamOtherTopics() throws Exception {
- compareToExpected(clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params
-> {
- params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, false);
- }));
+ compareToExpected(
+ clusters(
+ "mock",
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, false);
+ }));
}
/**
- * We'll make two queries, one with- and another one without summary
- * and assert that documents are shorter when highlighter is in use.
+ * We'll make two queries, one with- and another one without summary and
assert that documents are
+ * shorter when highlighter is in use.
*/
@Test
public void testClusteringOnHighlights() throws Exception {
String query = "+snippet:mine +" + QUERY_TESTSET_SAMPLE_DOCUMENTS;
- Consumer<ModifiableSolrParams> common = params -> {
- params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
- params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(80));
- params.add(EngineParameters.PARAM_CONTEXT_COUNT, Integer.toString(1));
- };
-
- List<Cluster<SolrDocument>> highlighted = clusters("echo", query,
- common.andThen(params -> {
- params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
- }));
-
- List<Cluster<SolrDocument>> full = clusters("echo", query,
- common.andThen(params -> {
- params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "false");
- }));
+ Consumer<ModifiableSolrParams> common =
+ params -> {
+ params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
+ params.add(EngineParameters.PARAM_CONTEXT_SIZE,
Integer.toString(80));
+ params.add(EngineParameters.PARAM_CONTEXT_COUNT,
Integer.toString(1));
+ };
+
+ List<Cluster<SolrDocument>> highlighted =
+ clusters(
+ "echo",
+ query,
+ common.andThen(
+ params -> {
+ params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT,
"true");
+ }));
+
+ List<Cluster<SolrDocument>> full =
+ clusters(
+ "echo",
+ query,
+ common.andThen(
+ params -> {
+ params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT,
"false");
+ }));
// Echo clustering algorithm just returns document fields as cluster labels
// so highlighted snippets should never be longer than full field content.
@@ -149,7 +168,8 @@ public class ClusteringComponentTest extends SolrTestCaseJ4
{
List<String> labels2 = full.get(i).getLabels();
assertEquals(labels1.size(), labels2.size());
for (int j = 0; j < labels1.size(); j++) {
- MatcherAssert.assertThat("Summary shorter than original document?",
+ MatcherAssert.assertThat(
+ "Summary shorter than original document?",
labels1.get(j).length(),
Matchers.lessThanOrEqualTo(labels2.get(j).length()));
}
@@ -157,28 +177,37 @@ public class ClusteringComponentTest extends
SolrTestCaseJ4 {
}
/**
- * We'll make two queries, one short summaries and another one with longer
- * summaries and will check that the results differ.
+ * We'll make two queries, one short summaries and another one with longer
summaries and will
+ * check that the results differ.
*/
@Test
public void testSummaryFragSize() throws Exception {
String query = "+snippet:mine +" + QUERY_TESTSET_SAMPLE_DOCUMENTS;
- Consumer<ModifiableSolrParams> common = params -> {
- params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
- params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
- params.add(EngineParameters.PARAM_CONTEXT_COUNT, Integer.toString(1));
- };
-
- List<Cluster<SolrDocument>> shortSummaries = clusters("echo", query,
- common.andThen(params -> {
- params.add(EngineParameters.PARAM_CONTEXT_SIZE,
Integer.toString(30));
- }));
-
- List<Cluster<SolrDocument>> longSummaries = clusters("echo", query,
- common.andThen(params -> {
- params.add(EngineParameters.PARAM_CONTEXT_COUNT,
Integer.toString(80));
- }));
+ Consumer<ModifiableSolrParams> common =
+ params -> {
+ params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
+ params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
+ params.add(EngineParameters.PARAM_CONTEXT_COUNT,
Integer.toString(1));
+ };
+
+ List<Cluster<SolrDocument>> shortSummaries =
+ clusters(
+ "echo",
+ query,
+ common.andThen(
+ params -> {
+ params.add(EngineParameters.PARAM_CONTEXT_SIZE,
Integer.toString(30));
+ }));
+
+ List<Cluster<SolrDocument>> longSummaries =
+ clusters(
+ "echo",
+ query,
+ common.andThen(
+ params -> {
+ params.add(EngineParameters.PARAM_CONTEXT_COUNT,
Integer.toString(80));
+ }));
Assert.assertEquals(shortSummaries.size(), longSummaries.size());
for (int i = 0; i < shortSummaries.size(); i++) {
@@ -186,111 +215,117 @@ public class ClusteringComponentTest extends
SolrTestCaseJ4 {
List<String> longLabels = longSummaries.get(i).getLabels();
assertEquals(shortLabels.size(), longLabels.size());
for (int j = 0; j < shortLabels.size(); j++) {
- MatcherAssert.assertThat("Shorter summary is longer than longer
summary?",
+ MatcherAssert.assertThat(
+ "Shorter summary is longer than longer summary?",
shortLabels.get(j).length(),
Matchers.lessThanOrEqualTo(longLabels.get(j).length()));
}
}
}
- /**
- * Test passing algorithm parameters via SolrParams.
- */
+ /** Test passing algorithm parameters via SolrParams. */
@Test
public void testPassingAttributes() throws Exception {
- compareToExpected(clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params
-> {
- params.set("maxClusters", 2);
- params.set("hierarchyDepth", 1);
- params.add(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
- }));
+ compareToExpected(
+ clusters(
+ "mock",
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.set("maxClusters", 2);
+ params.set("hierarchyDepth", 1);
+ params.add(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
+ }));
}
- /**
- * Test passing algorithm parameters via Solr configuration file.
- */
+ /** Test passing algorithm parameters via Solr configuration file. */
@Test
public void testPassingAttributesViaSolrConfig() throws Exception {
compareToExpected(clusters("mock-solrconfig-attrs",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
- /**
- * Test maximum label truncation.
- */
+ /** Test maximum label truncation. */
@Test
public void testParamMaxLabels() throws Exception {
- List<Cluster<SolrDocument>> clusters = clusters("mock",
QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
- params.set("labelsPerCluster", "5");
- params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
- params.set(EngineParameters.PARAM_MAX_LABELS, "3");
- });
-
- clusters.forEach(c -> {
- MatcherAssert.assertThat(c.getLabels(), Matchers.hasSize(3));
- });
+ List<Cluster<SolrDocument>> clusters =
+ clusters(
+ "mock",
+ QUERY_TESTSET_SAMPLE_DOCUMENTS,
+ params -> {
+ params.set("labelsPerCluster", "5");
+ params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
+ params.set(EngineParameters.PARAM_MAX_LABELS, "3");
+ });
+
+ clusters.forEach(
+ c -> {
+ MatcherAssert.assertThat(c.getLabels(), Matchers.hasSize(3));
+ });
}
@Test
public void testCustomLanguageResources() throws Exception {
- compareToExpected(clusters(
- "testCustomLanguageResources",
- QUERY_TESTSET_SAMPLE_DOCUMENTS));
+ compareToExpected(clusters("testCustomLanguageResources",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
@Test
public void testParamDefaultLanguage() throws Exception {
- compareToExpected(clusters(
- "testParamDefaultLanguage",
- QUERY_TESTSET_SAMPLE_DOCUMENTS));
+ compareToExpected(clusters("testParamDefaultLanguage",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
/**
- * Verify that documents with an explicit language name
- * field are clustered in separate batches.
+ * Verify that documents with an explicit language name field are clustered
in separate batches.
*
* @see EngineParameters#PARAM_LANGUAGE_FIELD
*/
@Test
public void testParamLanguageField() throws Exception {
- compareToExpected(clusters(
- "testParamLanguageField",
- QUERY_TESTSET_SAMPLE_DOCUMENTS));
+ compareToExpected(clusters("testParamLanguageField",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
private void compareToExpected(List<Cluster<SolrDocument>> clusters) throws
IOException {
compareToExpected("", clusters);
}
- private void compareToExpected(String resourceSuffix,
- List<Cluster<SolrDocument>> clusters) throws
IOException {
+ private void compareToExpected(String resourceSuffix,
List<Cluster<SolrDocument>> clusters)
+ throws IOException {
String actual = toString(clusters);
String expected = getTestResource(getClass(), resourceSuffix);
compareWhitespaceNormalized(actual, expected);
}
static void compareWhitespaceNormalized(String actual, String expected) {
- Function<String, String> normalize = v -> v.replaceAll("\r",
"").replaceAll("[ \t]+", " ").trim();
+ Function<String, String> normalize =
+ v -> v.replaceAll("\r", "").replaceAll("[ \t]+", " ").trim();
if (!normalize.apply(expected).equals(normalize.apply(actual))) {
- throw new AssertionError(String.format(Locale.ROOT,
- "The actual clusters structure differs from the expected one.
Expected:\n%s\n\nActual:\n%s",
- expected,
- actual));
+ throw new AssertionError(
+ String.format(
+ Locale.ROOT,
+ "The actual clusters structure differs from the expected one.
Expected:\n%s\n\nActual:\n%s",
+ expected,
+ actual));
}
}
static String getTestResource(Class<?> clazz, String expectedResourceSuffix)
throws IOException {
RandomizedContext ctx = RandomizedContext.current();
- String resourceName = String.format(Locale.ROOT,
- "%s-%s%s.txt",
- ctx.getTargetClass().getSimpleName(),
- ctx.getTargetMethod().getName(),
- expectedResourceSuffix.isEmpty() ? "" : "-" + expectedResourceSuffix);
+ String resourceName =
+ String.format(
+ Locale.ROOT,
+ "%s-%s%s.txt",
+ ctx.getTargetClass().getSimpleName(),
+ ctx.getTargetMethod().getName(),
+ expectedResourceSuffix.isEmpty() ? "" : "-" +
expectedResourceSuffix);
String expected;
try (InputStream is = clazz.getResourceAsStream(resourceName)) {
if (is == null) {
- throw new AssertionError("Test resource not found: " + resourceName +
" (class-relative to " +
- clazz.getName() + ")");
+ throw new AssertionError(
+ "Test resource not found: "
+ + resourceName
+ + " (class-relative to "
+ + clazz.getName()
+ + ")");
}
expected = new String(is.readAllBytes(), StandardCharsets.UTF_8);
@@ -302,33 +337,38 @@ public class ClusteringComponentTest extends
SolrTestCaseJ4 {
return toString(clusters, "", new StringBuilder()).toString();
}
- private StringBuilder toString(List<Cluster<SolrDocument>> clusters, String
indent, StringBuilder sb) {
- clusters.forEach(c -> {
- sb.append(indent);
- sb.append("- " + c.getLabels().stream().collect(Collectors.joining(";
")));
- if (!c.getDocuments().isEmpty()) {
- sb.append(" [" + c.getDocuments().size() + "]");
- }
- sb.append("\n");
-
- if (!c.getClusters().isEmpty()) {
- toString(c.getClusters(), indent + " ", sb);
- }
- });
+ private StringBuilder toString(
+ List<Cluster<SolrDocument>> clusters, String indent, StringBuilder sb) {
+ clusters.forEach(
+ c -> {
+ sb.append(indent);
+ sb.append("- " +
c.getLabels().stream().collect(Collectors.joining("; ")));
+ if (!c.getDocuments().isEmpty()) {
+ sb.append(" [" + c.getDocuments().size() + "]");
+ }
+ sb.append("\n");
+
+ if (!c.getClusters().isEmpty()) {
+ toString(c.getClusters(), indent + " ", sb);
+ }
+ });
return sb;
}
- private List<Cluster<SolrDocument>> clusters(String engineName, String
query, Consumer<ModifiableSolrParams> paramsConsumer) {
+ private List<Cluster<SolrDocument>> clusters(
+ String engineName, String query, Consumer<ModifiableSolrParams>
paramsConsumer) {
return clusters("/select", engineName, query, paramsConsumer);
}
private List<Cluster<SolrDocument>> clusters(String engineName, String
query) {
- return clusters("/select", engineName, query, params -> {
- });
+ return clusters("/select", engineName, query, params -> {});
}
- private List<Cluster<SolrDocument>> clusters(String handlerName, String
engineName, String query,
- Consumer<ModifiableSolrParams>
paramsConsumer) {
+ private List<Cluster<SolrDocument>> clusters(
+ String handlerName,
+ String engineName,
+ String query,
+ Consumer<ModifiableSolrParams> paramsConsumer) {
SolrCore core = h.getCore();
ModifiableSolrParams reqParams = new ModifiableSolrParams();
@@ -339,10 +379,12 @@ public class ClusteringComponentTest extends
SolrTestCaseJ4 {
paramsConsumer.accept(reqParams);
SearchHandler handler = (SearchHandler)
core.getRequestHandler(handlerName);
- assertTrue("Clustering engine named '" + engineName + "' exists.",
handler.getComponents().stream()
- .filter(c -> c instanceof ClusteringComponent)
- .flatMap(c -> ((ClusteringComponent) c).getEngineNames().stream())
- .anyMatch(localName -> Objects.equals(localName, engineName)));
+ assertTrue(
+ "Clustering engine named '" + engineName + "' exists.",
+ handler.getComponents().stream()
+ .filter(c -> c instanceof ClusteringComponent)
+ .flatMap(c -> ((ClusteringComponent) c).getEngineNames().stream())
+ .anyMatch(localName -> Objects.equals(localName, engineName)));
SolrQueryResponse rsp = new SolrQueryResponse();
rsp.addResponseHeader(new SimpleOrderedMap<>());
@@ -367,29 +409,33 @@ public class ClusteringComponentTest extends
SolrTestCaseJ4 {
@SuppressWarnings("unchecked")
private Cluster<SolrDocument> toCluster(NamedList<Object> v, Map<String,
SolrDocument> idToDoc) {
Cluster<SolrDocument> c = new Cluster<>();
- v.forEach((key, value) -> {
- switch (key) {
- case ClusteringResponse.DOCS_NODE:
- ((List<String>) value).forEach(docId ->
c.addDocument(idToDoc.get(docId)));
- break;
- case ClusteringResponse.LABELS_NODE:
- ((List<String>) value).forEach(c::addLabel);
- break;
- case ClusteringResponse.SCORE_NODE:
- c.setScore(((Number) value).doubleValue());
- break;
- case ClusteringResponse.CLUSTERS_NODE:
- ((List<NamedList<Object>>) value).forEach(sub -> {
- c.addCluster(toCluster(sub, idToDoc));
- });
- break;
- case ClusteringResponse.IS_OTHER_TOPICS:
- // Just ignore the attribute.
- break;
- default:
- throw new RuntimeException("Unknown output property " + key + " in
cluster: " + v.jsonStr());
- }
- });
+ v.forEach(
+ (key, value) -> {
+ switch (key) {
+ case ClusteringResponse.DOCS_NODE:
+ ((List<String>) value).forEach(docId ->
c.addDocument(idToDoc.get(docId)));
+ break;
+ case ClusteringResponse.LABELS_NODE:
+ ((List<String>) value).forEach(c::addLabel);
+ break;
+ case ClusteringResponse.SCORE_NODE:
+ c.setScore(((Number) value).doubleValue());
+ break;
+ case ClusteringResponse.CLUSTERS_NODE:
+ ((List<NamedList<Object>>) value)
+ .forEach(
+ sub -> {
+ c.addCluster(toCluster(sub, idToDoc));
+ });
+ break;
+ case ClusteringResponse.IS_OTHER_TOPICS:
+ // Just ignore the attribute.
+ break;
+ default:
+ throw new RuntimeException(
+ "Unknown output property " + key + " in cluster: " +
v.jsonStr());
+ }
+ });
return c;
}
}
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithm.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithm.java
index 459f7fa..51448a0 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithm.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithm.java
@@ -16,23 +16,20 @@
*/
package org.apache.solr.handler.clustering;
-import org.carrot2.attrs.AttrComposite;
-import org.carrot2.clustering.Cluster;
-import org.carrot2.clustering.ClusteringAlgorithm;
-import org.carrot2.clustering.Document;
-import org.carrot2.language.LanguageComponents;
-
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
+import org.carrot2.attrs.AttrComposite;
+import org.carrot2.clustering.Cluster;
+import org.carrot2.clustering.ClusteringAlgorithm;
+import org.carrot2.clustering.Document;
+import org.carrot2.language.LanguageComponents;
/**
- * Test-only pseudo clustering algorithm that creates
- * a cluster for each input document and sets the labels
- * of this cluster to the full content of clustered input
- * fields.
+ * Test-only pseudo clustering algorithm that creates a cluster for each input
document and sets the
+ * labels of this cluster to the full content of clustered input fields.
*/
public class EchoClusteringAlgorithm extends AttrComposite implements
ClusteringAlgorithm {
@Override
@@ -46,16 +43,19 @@ public class EchoClusteringAlgorithm extends AttrComposite
implements Clustering
}
@Override
- public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T>
documentStream, LanguageComponents languageComponents) {
+ public <T extends Document> List<Cluster<T>> cluster(
+ Stream<? extends T> documentStream, LanguageComponents
languageComponents) {
List<Cluster<T>> clusters = new ArrayList<>();
- documentStream.forEach(document -> {
- final Cluster<T> cluster = new Cluster<>();
- cluster.addDocument(document);
- document.visitFields((field, value) -> {
- cluster.addLabel(field + ":" + value);
- });
- clusters.add(cluster);
- });
+ documentStream.forEach(
+ document -> {
+ final Cluster<T> cluster = new Cluster<>();
+ cluster.addDocument(document);
+ document.visitFields(
+ (field, value) -> {
+ cluster.addLabel(field + ":" + value);
+ });
+ clusters.add(cluster);
+ });
return clusters;
}
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithmProvider.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithmProvider.java
index 030a325..b8684a5 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithmProvider.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/EchoClusteringAlgorithmProvider.java
@@ -18,9 +18,7 @@ package org.apache.solr.handler.clustering;
import org.carrot2.clustering.ClusteringAlgorithmProvider;
-/**
- * SPI provider of {@link EchoClusteringAlgorithm}.
- */
+/** SPI provider of {@link EchoClusteringAlgorithm}. */
public class EchoClusteringAlgorithmProvider implements
ClusteringAlgorithmProvider {
@Override
public String name() {
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/MockClusteringAlgorithm.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/MockClusteringAlgorithm.java
index 0863a81..864f5f7 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/MockClusteringAlgorithm.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/MockClusteringAlgorithm.java
@@ -16,13 +16,6 @@
*/
package org.apache.solr.handler.clustering;
-import org.carrot2.attrs.AttrComposite;
-import org.carrot2.attrs.AttrInteger;
-import org.carrot2.clustering.Cluster;
-import org.carrot2.clustering.ClusteringAlgorithm;
-import org.carrot2.clustering.Document;
-import org.carrot2.language.LanguageComponents;
-
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
@@ -31,15 +24,20 @@ import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import org.carrot2.attrs.AttrComposite;
+import org.carrot2.attrs.AttrInteger;
+import org.carrot2.clustering.Cluster;
+import org.carrot2.clustering.ClusteringAlgorithm;
+import org.carrot2.clustering.Document;
+import org.carrot2.language.LanguageComponents;
-/**
- * Creates a stable set of synthetic clusters based on the provided parameters.
- */
+/** Creates a stable set of synthetic clusters based on the provided
parameters. */
public class MockClusteringAlgorithm extends AttrComposite implements
ClusteringAlgorithm {
public AttrInteger docsInCluster =
attributes.register(
"docsInCluster",
- AttrInteger.builder().label("Number of documents in each cluster.")
+ AttrInteger.builder()
+ .label("Number of documents in each cluster.")
.min(1)
.max(5)
.defaultValue(3));
@@ -47,7 +45,8 @@ public class MockClusteringAlgorithm extends AttrComposite
implements Clustering
public AttrInteger hierarchyDepth =
attributes.register(
"hierarchyDepth",
- AttrInteger.builder().label("Levels of clusters hierarchy.")
+ AttrInteger.builder()
+ .label("Levels of clusters hierarchy.")
.min(1)
.max(3)
.defaultValue(2));
@@ -55,7 +54,8 @@ public class MockClusteringAlgorithm extends AttrComposite
implements Clustering
public AttrInteger maxClusters =
attributes.register(
"maxClusters",
- AttrInteger.builder().label("Maximum number of clusters at each
hierarchy level.")
+ AttrInteger.builder()
+ .label("Maximum number of clusters at each hierarchy level.")
.min(2)
.max(100)
.defaultValue(3));
@@ -63,7 +63,8 @@ public class MockClusteringAlgorithm extends AttrComposite
implements Clustering
public AttrInteger labelsPerCluster =
attributes.register(
"labelsPerCluster",
- AttrInteger.builder().label("Number of labels generated for each
cluster.")
+ AttrInteger.builder()
+ .label("Number of labels generated for each cluster.")
.min(1)
.max(5)
.defaultValue(1));
@@ -79,30 +80,31 @@ public class MockClusteringAlgorithm extends AttrComposite
implements Clustering
}
@Override
- public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T>
documentStream,
- LanguageComponents
languageComponents) {
+ public <T extends Document> List<Cluster<T>> cluster(
+ Stream<? extends T> documentStream, LanguageComponents
languageComponents) {
List<T> documents = documentStream.collect(Collectors.toList());
if (docsInCluster.get() > documents.size()) {
throw new AssertionError();
}
- Supplier<T> docSupplier = new Supplier<>() {
- Iterator<T> i = documents.iterator();
+ Supplier<T> docSupplier =
+ new Supplier<>() {
+ Iterator<T> i = documents.iterator();
- @Override
- public T get() {
- if (!i.hasNext()) {
- i = documents.iterator();
- }
- return i.next();
- }
- };
+ @Override
+ public T get() {
+ if (!i.hasNext()) {
+ i = documents.iterator();
+ }
+ return i.next();
+ }
+ };
return createClusters(hierarchyDepth.get(), "Cluster ", docSupplier);
}
- private <T extends Document> List<Cluster<T>> createClusters(int level,
String prefix,
- Supplier<T>
docSupplier) {
+ private <T extends Document> List<Cluster<T>> createClusters(
+ int level, String prefix, Supplier<T> docSupplier) {
ArrayList<Cluster<T>> clusters = new ArrayList<>();
for (int count = maxClusters.get(), idx = 1; count > 0; count--, idx++) {
String label = prefix + (prefix.endsWith(" ") ? "" : ".") + idx;
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ResourceCheckAlgorithm.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ResourceCheckAlgorithm.java
index 8f03610..a7bc78c 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ResourceCheckAlgorithm.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/ResourceCheckAlgorithm.java
@@ -16,14 +16,6 @@
*/
package org.apache.solr.handler.clustering;
-import org.carrot2.attrs.AttrComposite;
-import org.carrot2.attrs.AttrString;
-import org.carrot2.clustering.Cluster;
-import org.carrot2.clustering.ClusteringAlgorithm;
-import org.carrot2.clustering.Document;
-import org.carrot2.language.LanguageComponents;
-import org.carrot2.language.LexicalData;
-
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -31,17 +23,22 @@ import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import org.carrot2.attrs.AttrComposite;
+import org.carrot2.attrs.AttrString;
+import org.carrot2.clustering.Cluster;
+import org.carrot2.clustering.ClusteringAlgorithm;
+import org.carrot2.clustering.Document;
+import org.carrot2.language.LanguageComponents;
+import org.carrot2.language.LexicalData;
/**
- * Creates synthetic clusters with diagnostics of
- * {@link LanguageComponents} passed to the clustering method.
+ * Creates synthetic clusters with diagnostics of {@link LanguageComponents}
passed to the
+ * clustering method.
*/
class ResourceCheckAlgorithm extends AttrComposite implements
ClusteringAlgorithm {
public AttrString text =
attributes.register(
- "text",
- AttrString.builder().label("Input text to analyze.")
- .defaultValue(null));
+ "text", AttrString.builder().label("Input text to
analyze.").defaultValue(null));
@Override
public Set<Class<?>> requiredLanguageComponents() {
@@ -49,8 +46,8 @@ class ResourceCheckAlgorithm extends AttrComposite implements
ClusteringAlgorith
}
@Override
- public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T>
documentStream,
- LanguageComponents
languageComponents) {
+ public <T extends Document> List<Cluster<T>> cluster(
+ Stream<? extends T> documentStream, LanguageComponents
languageComponents) {
ArrayList<Cluster<T>> clusters = new ArrayList<>();
Cluster<T> cluster = new Cluster<>();
@@ -61,13 +58,17 @@ class ResourceCheckAlgorithm extends AttrComposite
implements ClusteringAlgorith
clusters.add(cluster);
LexicalData lexicalData = languageComponents.get(LexicalData.class);
- cluster.addLabel(Arrays.stream(text.get().trim().split("[\\s]+"))
- .map(term -> String.format(Locale.ROOT,
- "%s[%s, %s]",
- term,
- lexicalData.ignoreWord(term) ? "ignoredWord" : "-",
- lexicalData.ignoreLabel(term) ? "ignoredLabel" : "-"))
- .collect(Collectors.joining(" ")));
+ cluster.addLabel(
+ Arrays.stream(text.get().trim().split("[\\s]+"))
+ .map(
+ term ->
+ String.format(
+ Locale.ROOT,
+ "%s[%s, %s]",
+ term,
+ lexicalData.ignoreWord(term) ? "ignoredWord" : "-",
+ lexicalData.ignoreLabel(term) ? "ignoredLabel" : "-"))
+ .collect(Collectors.joining(" ")));
return clusters;
}
diff --git
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/SampleData.java
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/SampleData.java
index 0fa8507..2ecb72f 100644
---
a/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/SampleData.java
+++
b/solr/modules/clustering/src/test/org/apache/solr/handler/clustering/SampleData.java
@@ -16,131 +16,129 @@
*/
package org.apache.solr.handler.clustering;
-/**
- * Sample data for tests.
- */
+/** Sample data for tests. */
final class SampleData {
static final String[][] SAMPLE_DOCUMENTS =
- new String[][]{
- {
- "Data Mining - Wikipedia",
- "Article about knowledge-discovery in databases (KDD), the
practice of automatically searching large stores of data for patterns."
- },
- {
- "Data mining - Wikipedia, the free encyclopedia",
- "Data mining is the entire process of applying computer-based
methodology, ... Moreover, some data-mining systems such as neural networks are
inherently geared ..."
- },
- {
- "Electronic Statistics Textbook: Data Mining Techniques",
- "Outlines the crucial concepts in data mining, defines the data
warehousing process, and offers examples of computational and graphical
exploratory data analysis techniques."
- },
- {
- "An Introduction to Data Mining",
- "Data mining, the extraction of hidden predictive information
from large ... Data mining tools predict future trends and behaviors, allowing
businesses to ..."
- },
- {
- "Data Mining: What is Data Mining?",
- "Outlines what knowledge discovery, the process of analyzing
data from different perspectives and summarizing it into useful information,
can do and how it works."
- },
- {
- "Data Mining Software, Data Mining Applications and Data Mining
Solutions",
- "The patterns uncovered using data mining help organizations
make better and ... data mining customer ... Data mining applications, on the
other hand, embed ..."
- },
- {
- "KD Nuggets",
- "Newsletter on the data mining and knowledge industries,
offering information on data mining, knowledge discovery, text mining, and web
mining software, courses, jobs, publications, and meetings."
- },
- {
- "data mining: Definition from Answers.com",
- "data mining n. The automatic extraction of useful, often
previously unknown information from large databases or data ... Data Mining For
Investing ..."
- },
- {
- "STATISTICA Data Mining and Predictive Modeling Solutions",
- "GRC site-wide menuing system research and development. ...
Contact a Data Mining Solutions Consultant. News and Success Stories. Events
..."
- },
- {
- "Data Mining: Text Mining, Visualization and Social Media",
- "Commentary on text mining, data mining, social media and data
visualization. ... While mining Twitter data for business and marketing
intelligence (trend/buzz ..."
- },
- {
- "Two Crows Corporation",
- "Dedicated to the development, marketing, sales and support of
tools for knowledge discovery to make data mining accessible and easy to use."
- },
- {
- "Thearling.com",
- "Kurt Thearling's site dedicated to sharing information about
data mining, the automated extraction of hidden predictive information from
databases, and other analytic technologies."
- },
- {
- "CCSU - Data Mining",
- "Offers degrees and certificates in data mining. Allows students
to explore cutting-edge data mining techniques and applications: market basket
analysis, decision trees, neural networks, machine learning, web mining, and
data modeling."
- },
- {
- "Oracle Data Mining",
- "Oracle Data Mining Product Center ... New Oracle Data Mining
Powers New Social CRM Application (more information ... Mining High-Dimensional
Data for ..."
- },
- {
- "Data Mining: An Introduction",
- "About.com article on how businesses are discovering new trends
and patterns of behavior that previously went unnoticed through data mining,
automated statistical analysis techniques."
- },
- {
- "Open Directory - Computers: Software: Databases: Data Mining",
- "Data Mining and Knowledge Discovery - A peer-reviewed journal
publishing ... Data mining creates information assets that an organization can
leverage to ..."
- },
- {
- "DMI:Data Mining Institute",
- "Data Mining Institute at UW-Madison ... The Data Mining
Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data
Mining Group of Microsoft ..."
- },
- {
- "The Data Mine",
- "Provides information about data mining also known as knowledge
discovery in databases (KDD) or simply knowledge discovery. List software,
events, organizations, and people working in data mining."
- },
- {
- "St@tServ - About Data Mining",
- "St@tServ Data Mining page ... Data mining in molecular biology,
by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining
Resources, ..."
- },
- {
- "MIT OpenCourseWare | Sloan School of Management | 15.062 Data
Mining ...",
- "Introduces students to a class of methods known as data mining
that assists managers in recognizing patterns and making intelligent use of
massive amounts of ..."
- },
- {
- "Pentaho Commercial Open Source Business Intelligence: Data
Mining",
- "For example, data mining can warn you there's a high
probability a specific ... Pentaho Data Mining is differentiated by its open,
standards-compliant nature, ..."
- },
- {
- "Investor Home - Data Mining",
- "Data Mining or Data Snooping is the practice of searching for
relationships and ... Data mining involves searching through databases for
correlations and patterns ..."
- },
- {
- "Predictive Modeling and Predictive Analytics Solutions |
Enterprise ...",
- "Insightful Enterprise Miner - Enterprise data mining for
predictive modeling and predictive analytics."
- },
- {
- "Data mining - SourceWatch",
- "These agencies reported 199 data mining projects, of which 68
... Office, \"DATA MINING. ... powerful technology known as data mining -- and
how, in the ..."
- },
- {
- "Statistical Data Mining Tutorials",
- "Includes a set of tutorials on many aspects of statistical data
mining, including the foundations of probability, the foundations of
statistical data analysis, and most of the classic machine learning and data
mining algorithms."
- },
- {
- "Data Mining",
- "With MicroStrategy, data mining scoring is fully integrated
into mainstream ... The integration of data mining models from other
applications is accomplished by ..."
- },
- {
- "Elder Research",
- "Provides consulting and short courses in data mining and
pattern discovery patterns in data."
- },
- {
- "SQL Server Data Mining > Home",
- "SQL Server Data Mining Portal ... Data Mining as an Application
Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server
2005 Data Mining (Article) ..."
- },
- {
- "Data Mining",
- "What is data mining? Find out here! ... Book Review: Data
Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does
it Have to Do with ..."
- },
- {
- "Data Mining Software and Text Mining | SAS",
- "... raw data to smarter ... Data Mining is an iterative process
of creating ... The knowledge gleaned from data and text mining can be used to
fuel ..."
- }
+ new String[][] {
+ {
+ "Data Mining - Wikipedia",
+ "Article about knowledge-discovery in databases (KDD), the practice
of automatically searching large stores of data for patterns."
+ },
+ {
+ "Data mining - Wikipedia, the free encyclopedia",
+ "Data mining is the entire process of applying computer-based
methodology, ... Moreover, some data-mining systems such as neural networks are
inherently geared ..."
+ },
+ {
+ "Electronic Statistics Textbook: Data Mining Techniques",
+ "Outlines the crucial concepts in data mining, defines the data
warehousing process, and offers examples of computational and graphical
exploratory data analysis techniques."
+ },
+ {
+ "An Introduction to Data Mining",
+ "Data mining, the extraction of hidden predictive information from
large ... Data mining tools predict future trends and behaviors, allowing
businesses to ..."
+ },
+ {
+ "Data Mining: What is Data Mining?",
+ "Outlines what knowledge discovery, the process of analyzing data
from different perspectives and summarizing it into useful information, can do
and how it works."
+ },
+ {
+ "Data Mining Software, Data Mining Applications and Data Mining
Solutions",
+ "The patterns uncovered using data mining help organizations make
better and ... data mining customer ... Data mining applications, on the other
hand, embed ..."
+ },
+ {
+ "KD Nuggets",
+ "Newsletter on the data mining and knowledge industries, offering
information on data mining, knowledge discovery, text mining, and web mining
software, courses, jobs, publications, and meetings."
+ },
+ {
+ "data mining: Definition from Answers.com",
+ "data mining n. The automatic extraction of useful, often previously
unknown information from large databases or data ... Data Mining For Investing
..."
+ },
+ {
+ "STATISTICA Data Mining and Predictive Modeling Solutions",
+ "GRC site-wide menuing system research and development. ... Contact
a Data Mining Solutions Consultant. News and Success Stories. Events ..."
+ },
+ {
+ "Data Mining: Text Mining, Visualization and Social Media",
+ "Commentary on text mining, data mining, social media and data
visualization. ... While mining Twitter data for business and marketing
intelligence (trend/buzz ..."
+ },
+ {
+ "Two Crows Corporation",
+ "Dedicated to the development, marketing, sales and support of tools
for knowledge discovery to make data mining accessible and easy to use."
+ },
+ {
+ "Thearling.com",
+ "Kurt Thearling's site dedicated to sharing information about data
mining, the automated extraction of hidden predictive information from
databases, and other analytic technologies."
+ },
+ {
+ "CCSU - Data Mining",
+ "Offers degrees and certificates in data mining. Allows students to
explore cutting-edge data mining techniques and applications: market basket
analysis, decision trees, neural networks, machine learning, web mining, and
data modeling."
+ },
+ {
+ "Oracle Data Mining",
+ "Oracle Data Mining Product Center ... New Oracle Data Mining Powers
New Social CRM Application (more information ... Mining High-Dimensional Data
for ..."
+ },
+ {
+ "Data Mining: An Introduction",
+ "About.com article on how businesses are discovering new trends and
patterns of behavior that previously went unnoticed through data mining,
automated statistical analysis techniques."
+ },
+ {
+ "Open Directory - Computers: Software: Databases: Data Mining",
+ "Data Mining and Knowledge Discovery - A peer-reviewed journal
publishing ... Data mining creates information assets that an organization can
leverage to ..."
+ },
+ {
+ "DMI:Data Mining Institute",
+ "Data Mining Institute at UW-Madison ... The Data Mining Institute
(DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group
of Microsoft ..."
+ },
+ {
+ "The Data Mine",
+ "Provides information about data mining also known as knowledge
discovery in databases (KDD) or simply knowledge discovery. List software,
events, organizations, and people working in data mining."
+ },
+ {
+ "St@tServ - About Data Mining",
+ "St@tServ Data Mining page ... Data mining in molecular biology, by
Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining
Resources, ..."
+ },
+ {
+ "MIT OpenCourseWare | Sloan School of Management | 15.062 Data
Mining ...",
+ "Introduces students to a class of methods known as data mining that
assists managers in recognizing patterns and making intelligent use of massive
amounts of ..."
+ },
+ {
+ "Pentaho Commercial Open Source Business Intelligence: Data Mining",
+ "For example, data mining can warn you there's a high probability a
specific ... Pentaho Data Mining is differentiated by its open,
standards-compliant nature, ..."
+ },
+ {
+ "Investor Home - Data Mining",
+ "Data Mining or Data Snooping is the practice of searching for
relationships and ... Data mining involves searching through databases for
correlations and patterns ..."
+ },
+ {
+ "Predictive Modeling and Predictive Analytics Solutions | Enterprise
...",
+ "Insightful Enterprise Miner - Enterprise data mining for predictive
modeling and predictive analytics."
+ },
+ {
+ "Data mining - SourceWatch",
+ "These agencies reported 199 data mining projects, of which 68 ...
Office, \"DATA MINING. ... powerful technology known as data mining -- and how,
in the ..."
+ },
+ {
+ "Statistical Data Mining Tutorials",
+ "Includes a set of tutorials on many aspects of statistical data
mining, including the foundations of probability, the foundations of
statistical data analysis, and most of the classic machine learning and data
mining algorithms."
+ },
+ {
+ "Data Mining",
+ "With MicroStrategy, data mining scoring is fully integrated into
mainstream ... The integration of data mining models from other applications is
accomplished by ..."
+ },
+ {
+ "Elder Research",
+ "Provides consulting and short courses in data mining and pattern
discovery patterns in data."
+ },
+ {
+ "SQL Server Data Mining > Home",
+ "SQL Server Data Mining Portal ... Data Mining as an Application
Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server
2005 Data Mining (Article) ..."
+ },
+ {
+ "Data Mining",
+ "What is data mining? Find out here! ... Book Review: Data Mining
and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have
to Do with ..."
+ },
+ {
+ "Data Mining Software and Text Mining | SAS",
+ "... raw data to smarter ... Data Mining is an iterative process of
creating ... The knowledge gleaned from data and text mining can be used to
fuel ..."
+ }
};
}