This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch OAK-11568 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 9635838aa471fb4cad5b2e2bd8963dbb62ad4a66 Author: Thomas Mueller <[email protected]> AuthorDate: Wed Mar 19 16:22:28 2025 +0100 OAK-11568 Elastic: improved compatibility for aggregation definitions --- .../index/elastic/ElasticIndexProviderService.java | 69 +--------------------- .../index/elastic/index/ElasticCustomAnalyzer.java | 63 +++++++++++++++++--- .../index/elastic/index/ElasticDocument.java | 2 + .../elastic/index/ElasticIndexEditorContext.java | 2 +- .../index/elastic/ElasticInferenceTest.java | 5 +- .../oak/plugins/index/elastic/ElasticPerfTest.java | 5 +- .../plugins/index/elastic/ElasticTestServer.java | 1 + .../elastic/index/ElasticIndexHelperTest.java | 33 +++++++++++ .../oak/plugins/index/search/util/ConfigUtil.java | 2 +- 9 files changed, 99 insertions(+), 83 deletions(-) diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java index f2aebf1670..b00b680303 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java @@ -16,9 +16,6 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic; -import org.apache.commons.io.FilenameUtils; -import org.apache.jackrabbit.oak.api.jmx.CacheStatsMBean; -import org.apache.jackrabbit.oak.cache.CacheStats; import org.apache.jackrabbit.oak.commons.IOUtils; import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard; import org.apache.jackrabbit.oak.plugins.index.AsyncIndexInfoService; @@ -50,13 +47,11 @@ import org.osgi.service.metatype.annotations.ObjectClassDefinition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; import java.util.ArrayList; import java.util.Dictionary; import java.util.Hashtable; import java.util.List; -import static org.apache.commons.io.FileUtils.ONE_MB; import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.registerMBean; import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.scheduleWithFixedDelay; @@ -130,8 +125,6 @@ public class ElasticIndexProviderService { private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexProviderService.class); - private static final String REPOSITORY_HOME = "repository.home"; - @Reference private StatisticsProvider statisticsProvider; @@ -149,11 +142,10 @@ public class ElasticIndexProviderService { private ExtractedTextCache extractedTextCache; - private final List<ServiceRegistration> regs = new ArrayList<>(); + private final List<ServiceRegistration<?>> regs = new ArrayList<>(); private final List<Registration> oakRegs = new ArrayList<>(); private Whiteboard whiteboard; - private File textExtractionDir; private ElasticConnection elasticConnection; private ElasticMetricHandler metricHandler; @@ -200,7 +192,7 @@ public class ElasticIndexProviderService { @Deactivate private void deactivate() { - for (ServiceRegistration reg : regs) { + for (ServiceRegistration<?> reg : regs) { reg.unregister(); } @@ -242,63 +234,6 @@ public class ElasticIndexProviderService { Dictionary<String, Object> props = new Hashtable<>(); props.put("type", ElasticIndexDefinition.TYPE_ELASTICSEARCH); regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, props)); -// oakRegs.add(registerMBean(whiteboard, -// TextExtractionStatsMBean.class, -// editorProvider.getExtractedTextCache().getStatsMBean(), -// TextExtractionStatsMBean.TYPE, -// "TextExtraction statistics")); - } - - private void initializeExtractedTextCache(final Config config, StatisticsProvider statisticsProvider) { - - extractedTextCache = new ExtractedTextCache( - config.extractedTextCacheSizeInMB() * ONE_MB, - config.extractedTextCacheExpiryInSecs(), - config.alwaysUsePreExtractedCache(), - textExtractionDir, - statisticsProvider); - if (extractedTextProvider != null) { - registerExtractedTextProvider(extractedTextProvider); - } - CacheStats stats = extractedTextCache.getCacheStats(); - if (stats != null) { - oakRegs.add(registerMBean(whiteboard, - CacheStatsMBean.class, stats, - CacheStatsMBean.TYPE, stats.getName())); - LOG.info("Extracted text caching enabled with maxSize {} MB, expiry time {} secs", - config.extractedTextCacheSizeInMB(), config.extractedTextCacheExpiryInSecs()); - } - } - - private void initializeTextExtractionDir(BundleContext bundleContext, Config config) { - String textExtractionDir = config.localTextExtractionDir(); - if (textExtractionDir.trim().isEmpty()) { - String repoHome = bundleContext.getProperty(REPOSITORY_HOME); - if (repoHome != null) { - textExtractionDir = FilenameUtils.concat(repoHome, "index"); - } - } - - if (textExtractionDir == null) { - throw new IllegalStateException(String.format("Text extraction directory cannot be determined as neither " + - "directory path [%s] nor repository home [%s] defined", PROP_LOCAL_TEXT_EXTRACTION_DIR, REPOSITORY_HOME)); - } - - this.textExtractionDir = new File(textExtractionDir); - } - - private void registerExtractedTextProvider(PreExtractedTextProvider provider) { - if (extractedTextCache != null) { - if (provider != null) { - String usage = extractedTextCache.isAlwaysUsePreExtractedCache() ? - "always" : "only during reindexing phase"; - LOG.info("Registering PreExtractedTextProvider {} with extracted text cache. " + - "It would be used {}", provider, usage); - } else { - LOG.info("Unregistering PreExtractedTextProvider with extracted text cache"); - } - extractedTextCache.setExtractedTextProvider(provider); - } } private ElasticConnection getElasticConnection(Config contextConfig) { diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java index 05026f9e20..84d252a04f 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java @@ -55,6 +55,7 @@ import java.io.Reader; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -97,7 +98,13 @@ public class ElasticCustomAnalyzer { NodeState defaultAnalyzer = state.getChildNode(FulltextIndexConstants.ANL_DEFAULT); if (defaultAnalyzer.exists()) { IndexSettingsAnalysis.Builder builder = new IndexSettingsAnalysis.Builder(); - Map<String, Object> analyzer = convertNodeState(defaultAnalyzer); + Map<String, Object> analyzer; + try { + analyzer = convertNodeState(defaultAnalyzer); + } catch (IOException e) { + LOG.warn("Can not load analyzer; using an empty configuration", e); + analyzer = Map.of(); + } String builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS); if (builtIn == null) { builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME); @@ -107,11 +114,14 @@ public class ElasticCustomAnalyzer { // content params, usually stop words for (ChildNodeEntry nodeEntry : defaultAnalyzer.getChildNodeEntries()) { + List<String> list; try { - analyzer.put(normalize(nodeEntry.getName()), loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION)); + list = loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION); } catch (IOException e) { - throw new IllegalStateException("Unable to load content for node entry " + nodeEntry.getName(), e); + LOG.warn("Unable to load analyzer content for entry '" + nodeEntry.getName() + "'; using empty list", e); + list = List.of(); } + analyzer.put(normalize(nodeEntry.getName()), list); } builder.analyzer(analyzerName, new Analyzer(null, JsonData.of(analyzer))); @@ -145,8 +155,22 @@ public class ElasticCustomAnalyzer { @NotNull private static TokenizerDefinition loadTokenizer(NodeState state) { - String name = normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME))); - Map<String, Object> args = convertNodeState(state); + String name; + Map<String, Object> args; + if (!state.exists()) { + LOG.warn("No tokenizer specified; the standard with an empty configuration"); + name = "Standard"; + args = new HashMap<String, Object>(); + } else { + name = Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)); + try { + args = convertNodeState(state); + } catch (IOException e) { + LOG.warn("Can not load tokenizer; using an empty configuration", e); + args = new HashMap<String, Object>(); + } + } + name = normalize(name); args.put(ANALYZER_TYPE, name); return new TokenizerDefinition(name, JsonData.of(args)); } @@ -228,7 +252,12 @@ public class ElasticCustomAnalyzer { } private static List<String> loadContent(NodeState file, String name, ContentTransformer transformer) throws IOException { - Blob blob = ConfigUtil.getBlob(file, name); + Blob blob; + try { + blob = ConfigUtil.getBlob(file, name); + } catch (IllegalArgumentException | IllegalStateException e) { + throw new IOException("Could not load " + name, e); + } try (Reader content = new InputStreamReader(Objects.requireNonNull(blob).getNewStream(), StandardCharsets.UTF_8)) { try (BufferedReader br = new BufferedReader(content)) { return br.lines() @@ -264,11 +293,25 @@ public class ElasticCustomAnalyzer { return name; } - private static Map<String, Object> convertNodeState(NodeState state) { - return convertNodeState(state, List.of(), List.of()); + private static Map<String, Object> convertNodeState(NodeState state) throws IOException { + try { + return convertNodeState(state, List.of(), List.of()); + } catch (IllegalStateException e) { + // convert runtime exception back to checked exception + throw new IOException("Can not convert", e); + } } - private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) { + /** + * Read analyzer configuration. + * + * @param state the node state + * @param transformers + * @param preloadedContent + * @return + * @throws IllegalStateException + */ + private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) throws IllegalStateException { Map<String, Object> luceneParams = StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(), Spliterator.ORDERED), false) .filter(ElasticCustomAnalyzer::isPropertySupported) .collect(Collectors.toMap(PropertyState::getName, ps -> { @@ -280,6 +323,8 @@ public class ElasticCustomAnalyzer { return loadContent(state.getChildNode(v.trim()), v.trim(), CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream(); } catch (IOException e) { + // convert checked exception to runtime exception to runtime exception, + // because the stream API doesn't support checked exceptions throw new IllegalStateException(e); } }).collect(Collectors.toList())); diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java index 2f1ee7e26e..a7918ec83f 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java @@ -97,6 +97,7 @@ public class ElasticDocument { map -> { Object existingValue = map.get(ElasticIndexHelper.DYNAMIC_PROPERTY_VALUE); if (existingValue instanceof Set) { + @SuppressWarnings("unchecked") Set<Object> existingSet = (Set<Object>) existingValue; existingSet.add(value); } else { @@ -134,6 +135,7 @@ public class ElasticDocument { if (existingValue == null) { finalValue = value; } else if (existingValue instanceof Set) { + @SuppressWarnings("unchecked") Set<Object> existingSet = (Set<Object>) existingValue; existingSet.add(value); finalValue = existingSet; diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java index 330ebe416c..4e7f68cf95 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java @@ -40,7 +40,7 @@ class ElasticIndexEditorContext extends FulltextIndexEditorContext<ElasticDocume } @Override - public IndexDefinition.Builder newDefinitionBuilder() { + public ElasticIndexDefinition.Builder newDefinitionBuilder() { return new ElasticIndexDefinition.Builder(((ElasticIndexDefinition) definition).getIndexPrefix()); } diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java index 22a334f344..d0c459f0c5 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java @@ -145,12 +145,13 @@ public class ElasticInferenceTest extends ElasticAbstractQueryTest { for (String path : paths) { URL json = this.getClass().getResource("/inference" + path + ".json"); if (json != null) { - Map<String, Object> map = mapper.readValue(json, Map.class); + @SuppressWarnings("unchecked") + Map<String, Collection<Double>> map = mapper.readValue(json, Map.class); ObjectNode updateDoc = mapper.createObjectNode(); ObjectNode inferenceNode = updateDoc.putObject(ElasticIndexDefinition.INFERENCE); ArrayNode embeddingsNode = inferenceNode.putObject("embeddings").putArray("value"); inferenceNode.putObject("metadata").put("updatedAt", Instant.now().toEpochMilli()); - for (Double d : (Collection<Double>) map.get("embedding")) { + for (Double d : map.get("embedding")) { embeddingsNode.add(d); } updateDocument(index, path, updateDoc); diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java index c81412f755..77ef7ce225 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java @@ -167,12 +167,11 @@ public class ElasticPerfTest extends ElasticAbstractQueryTest { private void testQuery(String query, String language) throws Exception { Result result = executeQuery(query, language, NO_BINDINGS); - Iterable<ResultRow> it = (Iterable<ResultRow>) result.getRows(); - Iterator<ResultRow> iterator = it.iterator(); + Iterator<? extends ResultRow> iterator = result.getRows().iterator(); long start = LOG_PERF.startForInfoLog("Getting result rows"); int i = 0; while (iterator.hasNext()) { - ResultRow row = iterator.next(); + iterator.next(); i++; } LOG_PERF.end(start, -1,-1, "{} Results fetched", i); diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java index 4efcc665e5..cf694cb673 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java @@ -62,6 +62,7 @@ public class ElasticTestServer implements AutoCloseable { return CONTAINER; } + @SuppressWarnings("resource") private synchronized void setup() { String esDockerImageVersion = ELASTIC_DOCKER_IMAGE_VERSION != null ? ELASTIC_DOCKER_IMAGE_VERSION : Version.VERSION.toString(); LOG.info("Elasticsearch test Docker image version: {}.", esDockerImageVersion); diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java index 9b7372967e..a32c90dca0 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java @@ -95,6 +95,39 @@ public class ElasticIndexHelperTest { ElasticIndexHelper.createIndexRequest("prefix.path", definition); } + @Test + public void analyzerWithEmptyTokenizer() { + IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder(); + IndexDefinitionBuilder.IndexRule indexRule = builder.indexRule("idxRule"); + indexRule.property("foo").type("String").useInSimilarity(); + + Tree analyzer = builder.getBuilderTree().addChild("analyzers"); + Tree defaultAnalyzer = analyzer.addChild("default"); + defaultAnalyzer.setProperty(FulltextIndexConstants.ANL_CLASS, "org.apache.lucene.analysis.en.EnglishAnalyzer"); + defaultAnalyzer.addChild("tokenizer"); + defaultAnalyzer.addChild("filter"); + + NodeState nodeState = builder.build(); + ElasticIndexDefinition definition = + new ElasticIndexDefinition(nodeState, nodeState, "path", "prefix"); + ElasticIndexHelper.createIndexRequest("prefix.path", definition); + } + + @Test + public void analyzerWithEmptyDefault() { + IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder(); + IndexDefinitionBuilder.IndexRule indexRule = builder.indexRule("idxRule"); + indexRule.property("foo").type("String").useInSimilarity(); + + Tree analyzer = builder.getBuilderTree().addChild("analyzers"); + analyzer.addChild("default"); + + NodeState nodeState = builder.build(); + ElasticIndexDefinition definition = + new ElasticIndexDefinition(nodeState, nodeState, "path", "prefix"); + ElasticIndexHelper.createIndexRequest("prefix.path", definition); + } + @Test() public void indexSettingsAreCorrectlySet() { IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder(); diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java index 9b5c3d0491..517e055d7b 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java @@ -110,7 +110,7 @@ public class ConfigUtil { * the jcr:content/@jcr:data property to get the binary content */ @Nullable - public static Blob getBlob(NodeState state, String resourceName){ + public static Blob getBlob(NodeState state, String resourceName) { NodeState contentNode = state.getChildNode(JcrConstants.JCR_CONTENT); checkArgument(contentNode.exists(), "Was expecting to find jcr:content node to read resource %s", resourceName); PropertyState property = contentNode.getProperty(JcrConstants.JCR_DATA);
