This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch OAK-11504 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 624d4381652a6ac136791da3946c35d194b93a84 Author: Thomas Mueller <[email protected]> AuthorDate: Wed Feb 19 14:06:55 2025 +0100 OAK-11504 Elasticsearch: support flattened fields --- .../index/elastic/ElasticIndexDefinition.java | 23 +++ .../index/elastic/ElasticPropertyDefinition.java | 175 ++++++++++++--------- .../index/elastic/index/ElasticDocument.java | 16 ++ .../index/elastic/index/ElasticDocumentMaker.java | 14 +- .../index/elastic/index/ElasticIndexHelper.java | 8 + .../index/elastic/query/ElasticRequestHandler.java | 11 +- .../elastic/ElasticRegexPropertyIndexTest.java | 62 ++++++++ .../oak/plugins/index/search/FieldNames.java | 5 + .../oak/plugins/index/search/IndexDefinition.java | 6 + 9 files changed, 234 insertions(+), 86 deletions(-) diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java index 53e1f3c034..9d69551d1c 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java @@ -31,6 +31,7 @@ import java.util.stream.Stream; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.commons.collections.StreamUtils; +import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants; import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition; import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition; @@ -320,6 +321,12 @@ public class ElasticIndexDefinition extends IndexDefinition { if (propertyDefinitions == null) { // if there are no property definitions we return the default keyword name // this can happen for properties that were not explicitly defined (eg: created with a regex) + ElasticPropertyDefinition pd = getMatchingRegexPropertyDefinition(propertyName); + if (pd != null) { + if (pd.isFlattened()) { + return FieldNames.FLATTENED_FIELD_PREFIX + pd.nodeName + "." + propertyName; + } + } return propertyName + ".keyword"; } @@ -332,6 +339,22 @@ public class ElasticIndexDefinition extends IndexDefinition { return field; } + /** + * Try to get the matching regular expression property definition, if any + * + * @param propertyName the property name (may not be null) + * @return the property definition, or null if not found + */ + private ElasticPropertyDefinition getMatchingRegexPropertyDefinition(String propertyName) { + for (IndexingRule rule : getDefinedRules()) { + PropertyDefinition pd = rule.getConfig(propertyName); + if (pd != null && pd.isRegexp) { + return (ElasticPropertyDefinition) pd; + } + } + return null; + } + public boolean isAnalyzed(List<PropertyDefinition> propertyDefinitions) { return propertyDefinitions.stream().anyMatch(pd -> pd.analyzed); } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java index f38191b307..df5ee128e6 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java @@ -24,96 +24,119 @@ import org.apache.jackrabbit.oak.spi.state.NodeState; public class ElasticPropertyDefinition extends PropertyDefinition { - public static final String DEFAULT_SIMILARITY_METRIC = "l2_norm"; - static final String PROP_SIMILARITY_METRIC = "similarityMetric"; - private static final String PROP_SIMILARITY = "similarity"; - private static final String PROP_K = "k"; - private static final String PROP_CANDIDATES = "candidates"; - private static final float DEFAULT_SIMILARITY = 0.95f; - private static final int DEFAULT_K = 10; - private static final int DEFAULT_CANDIDATES = 500; - private KnnSearchParameters knnSearchParameters; - - /** - * Whether to use dynamic boosted values in full text queries, default is true - */ - private static final String PROP_USE_IN_FULL_TEXT_QUERY = "useInFullTextQuery"; - private final boolean useInFullTextQuery; - - public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) { - super(idxDefn, nodeName, defn); - if (this.useInSimilarity) { - knnSearchParameters = new KnnSearchParameters( - getOptionalValue(defn, PROP_SIMILARITY_METRIC, DEFAULT_SIMILARITY_METRIC), - getOptionalValue(defn, PROP_SIMILARITY, DEFAULT_SIMILARITY), - getOptionalValue(defn, PROP_K, DEFAULT_K), - getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_CANDIDATES)); + public static final String DEFAULT_SIMILARITY_METRIC = "l2_norm"; + static final String PROP_SIMILARITY_METRIC = "similarityMetric"; + private static final String PROP_SIMILARITY = "similarity"; + private static final String PROP_K = "k"; + private static final String PROP_CANDIDATES = "candidates"; + private static final float DEFAULT_SIMILARITY = 0.95f; + private static final int DEFAULT_K = 10; + private static final int DEFAULT_CANDIDATES = 500; + private KnnSearchParameters knnSearchParameters; + + /** + * Whether to use dynamic boosted values in full text queries, default is true + */ + private static final String PROP_USE_IN_FULL_TEXT_QUERY = "useInFullTextQuery"; + private final boolean useInFullTextQuery; + + /** + * Whether regex properties are flattened (using the "flattened" field type) + */ + public static final String PROP_IS_FLATTENED = "isFlattened"; + private final boolean isFlattened; + + public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) { + super(idxDefn, nodeName, defn); + if (this.useInSimilarity) { + knnSearchParameters = new KnnSearchParameters( + getOptionalValue(defn, PROP_SIMILARITY_METRIC, DEFAULT_SIMILARITY_METRIC), + getOptionalValue(defn, PROP_SIMILARITY, DEFAULT_SIMILARITY), + getOptionalValue(defn, PROP_K, DEFAULT_K), + getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_CANDIDATES)); + } + this.useInFullTextQuery = this.dynamicBoost && getOptionalValue(defn, PROP_USE_IN_FULL_TEXT_QUERY, true); + this.isFlattened = getOptionalValue(defn, PROP_IS_FLATTENED, false); } - this.useInFullTextQuery = this.dynamicBoost && getOptionalValue(defn, PROP_USE_IN_FULL_TEXT_QUERY, true); - } - public KnnSearchParameters getKnnSearchParameters() { - return knnSearchParameters; - } + public KnnSearchParameters getKnnSearchParameters() { + return knnSearchParameters; + } public boolean useInFullTextQuery() { return useInFullTextQuery; } - /** - * Class for defining parameters of approximate knn search on dense_vector fields - * <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html">...</a> and - * <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html">...</a> - */ - public static class KnnSearchParameters { - - public KnnSearchParameters(String similarityMetric, float similarity, int k, int candidates) { - this.similarityMetric = similarityMetric; - this.similarity = similarity; - this.k = k; - this.candidates = candidates; - } - /** - * Similarity metric used to compare query and document vectors. Possible values are l2_norm (default), cosine, - * dot_product, max_inner_product + * Class for defining parameters of approximate knn search on dense_vector fields + * <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html">...</a> and + * <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html">...</a> */ - private final String similarityMetric; - /** - * Minimum similarity for the document vector to be considered as a match. Required when cosine, dot_product - * or max_inner_product is set as similarityMetric - */ - private final float similarity; - /** - * Number of nearest neighbours to return. Must be <= candidates - * vector added as a field - */ - private final int k; + public static class KnnSearchParameters { - /** - * Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. The - * candidates parameter controls the number of exact similarity computations. Specifically, we compute exact - * similarity for the top candidates candidate vectors in each segment. As a reminder, each Elasticsearch index has - * >= 1 shards, and each shard has >= 1 segments. That means if you set "candidates": 200 for an index with 2 - * shards, each with 3 segments, then you’ll compute the exact similarity for 2 * 3 * 200 = 1200 vectors. candidates - * must be set to a number greater or equal to the number of Elasticsearch results you want to get. Higher values - * generally mean higher recall and higher latency. - */ - private final int candidates; + public KnnSearchParameters(String similarityMetric, float similarity, int k, int candidates) { + this.similarityMetric = similarityMetric; + this.similarity = similarity; + this.k = k; + this.candidates = candidates; + } - public String getSimilarityMetric() { - return similarityMetric; - } - public float getSimilarity() { - return similarity; + /** + * Similarity metric used to compare query and document vectors. Possible values are l2_norm (default), cosine, + * dot_product, max_inner_product + */ + private final String similarityMetric; + + /** + * Minimum similarity for the document vector to be considered as a match. Required when cosine, dot_product + * or max_inner_product is set as similarityMetric + */ + private final float similarity; + + /** + * Number of nearest neighbours to return. Must be <= candidates + * vector added as a field + */ + private final int k; + + /** + * Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. The + * candidates parameter controls the number of exact similarity computations. Specifically, we compute exact + * similarity for the top candidates candidate vectors in each segment. As a reminder, each Elasticsearch index has + * >= 1 shards, and each shard has >= 1 segments. That means if you set "candidates": 200 for an index with 2 + * shards, each with 3 segments, then you’ll compute the exact similarity for 2 * 3 * 200 = 1200 vectors. candidates + * must be set to a number greater or equal to the number of Elasticsearch results you want to get. Higher values + * generally mean higher recall and higher latency. + */ + private final int candidates; + + public String getSimilarityMetric() { + return similarityMetric; + } + + public float getSimilarity() { + return similarity; + } + + public int getK() { + return k; + } + + public int getCandidates() { + return candidates; + } } - public int getK() { - return k; + public boolean isFlattened() { + return isFlattened; } - public int getCandidates() { - return candidates; + @Override + public String toString() { + return "ElasticPropertyDefinition{" + super.toString() + + ", useInFullTextQuery=" + useInFullTextQuery + + ", isFlattened=" + isFlattened + + '}'; } - } + } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java index 1d038835a5..3c7dc6f4f3 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java @@ -195,4 +195,20 @@ public class ElasticDocument { return propertiesToRemove; } + @Override + public String toString() { + StringBuilder buff = new StringBuilder(); + buff.append("path:").append(path).append('\n'); + if (!fulltext.isEmpty()) { + buff.append("fulltext:").append(fulltext).append('\n'); + } + if (!properties.isEmpty()) { + buff.append("properties:").append(properties).append('\n'); + } + if (!dynamicProperties.isEmpty()) { + buff.append("dynamicProperties:").append(dynamicProperties).append('\n'); + } + return buff.toString(); + } + } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java index 9006fa3861..5316cf1127 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java @@ -23,6 +23,7 @@ import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.commons.log.LogSilencer; import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition; +import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition; import org.apache.jackrabbit.oak.plugins.index.search.Aggregate; import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition; @@ -169,6 +170,13 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument> // If the actual property value is different from the property type defined in the index definition/mapping - this will try to convert the property if possible, // otherwise will log a warning and not try and add the property to index. If we try and index incompatible data types (like String to Date), // we would get an exception while indexing the node on elastic search and other properties for the node will also don't get indexed. (See OAK-9665). + String fieldName = pname; + if (pd.isRegexp) { + ElasticPropertyDefinition epd = (ElasticPropertyDefinition) pd; + if (epd.isFlattened()) { + fieldName = FieldNames.FLATTENED_FIELD_PREFIX + epd.nodeName + "." + pname; + } + } int tag = pd.getType(); Object f; try { @@ -184,12 +192,12 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument> f = property.getValue(Type.STRING, i); } - doc.addProperty(pname, f); + doc.addProperty(fieldName, f); } catch (Exception e) { if (!LOG_SILENCER.silence(LOG_KEY_COULD_NOT_CONVERT_PROPERTY)) { LOG.warn( - "[{}] Ignoring property. Could not convert property {} of type {} to type {} for path {}. Error: {}", - getIndexName(), pname, + "[{}] Ignoring property. Could not convert property {} (field {}) of type {} to type {} for path {}. Error: {}", + getIndexName(), pname, fieldName, Type.fromTag(property.getType().tag(), false), Type.fromTag(tag, false), path, e.toString()); } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java index b49022403b..9ebe703854 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java @@ -241,6 +241,14 @@ class ElasticIndexHelper { Type<?> type = null; for (PropertyDefinition pd : propertyDefinitions) { type = Type.fromTag(pd.getType(), false); + if (pd.isRegexp) { + ElasticPropertyDefinition epd = (ElasticPropertyDefinition) pd; + if (epd.isFlattened()) { + Property.Builder pBuilder = new Property.Builder(); + pBuilder.flattened(b2 -> b2.index(true)); + builder.properties(FieldNames.FLATTENED_FIELD_PREFIX + pd.nodeName, pBuilder.build()); + } + } } Property.Builder pBuilder = new Property.Builder(); diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java index 5b842c5830..dd82fd2881 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java @@ -911,7 +911,6 @@ public class ElasticRequestHandler { } final String field = elasticIndexDefinition.getElasticKeyword(propertyName); - Query in; switch (propType) { case PropertyType.DATE: { @@ -932,18 +931,16 @@ public class ElasticRequestHandler { } default: { if (pr.isLike) { - return like(propertyName, pr.first.getValue(Type.STRING)); + in = like(propertyName, pr.first.getValue(Type.STRING)); + } else { + // TODO Confirm that all other types can be treated as string + in = newPropertyRestrictionQuery(field, pr, value -> value.getValue(Type.STRING)); } - - //TODO Confirm that all other types can be treated as string - in = newPropertyRestrictionQuery(field, pr, value -> value.getValue(Type.STRING)); } } - if (in != null) { return in; } - throw new IllegalStateException("PropertyRestriction not handled " + pr + " for index " + defn); } diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java new file mode 100644 index 0000000000..9531f23ceb --- /dev/null +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.oak.plugins.index.elastic; + +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.MatcherAssert.assertThat; + +import java.util.List; + +import org.apache.jackrabbit.oak.api.Tree; +import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants; +import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder; +import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder.PropertyRule; +import org.junit.Test; + +public class ElasticRegexPropertyIndexTest extends ElasticAbstractQueryTest { + + @Test + public void regexProperty() throws Exception { + IndexDefinitionBuilder builder = createIndex("allProperties"); + PropertyRule prop = builder.indexRule("nt:base").property("allProperties"); + prop.getBuilderTree().setProperty(FulltextIndexConstants.PROP_IS_REGEX, true); + prop.getBuilderTree().setProperty(FulltextIndexConstants.PROP_NAME, "^[^\\/]*$"); + prop.nodeScopeIndex(); + prop.getBuilderTree().setProperty(ElasticPropertyDefinition.PROP_IS_FLATTENED, true); + + setIndex("test1", builder); + root.commit(); + + Tree test = root.getTree("/").addChild("test"); + test.addChild("a").setProperty("propa", "foo"); + test.addChild("b").setProperty("propa", "foo"); + test.addChild("c").setProperty("propa", "foo2"); + test.addChild("d").setProperty("propc", "foo"); + test.addChild("e").setProperty("propd", "foo"); + root.commit(); + + String propaQuery = "select [jcr:path] from [nt:base] where [propa] = 'foo'"; + + assertEventually(() -> { + String explain = explain(propaQuery); + assertThat(explain, containsString("elasticsearch:test1")); + assertThat(explain, containsString("[{\"term\":{\"flat:allProperties.propa\":{\"value\":\"foo\"}}}]")); + assertQuery(propaQuery, List.of("/test/a", "/test/b")); + }); + } + +} diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java index a8d2237702..a86822ba36 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java @@ -84,6 +84,11 @@ public final class FieldNames { */ public static final String ANALYZED_FIELD_PREFIX = "full:"; + /** + * Prefix for all field names that are flattened. + */ + public static final String FLATTENED_FIELD_PREFIX = "flat:"; + /** * Prefix used for storing fulltext of relative node */ diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java index bdf518a1f4..5a4d621500 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java @@ -1268,6 +1268,12 @@ public class IndexDefinition implements Aggregate.AggregateMapper { return config; } else if (namePatterns.size() > 0) { // check patterns + if (NodeStateUtils.isHidden(propertyName)) { + // hidden properties (eg. ":nodeName") do match the regex, + // and we should probably ignore them; + // but doing so would break "bug compatibility" + // return null; + } for (NamePattern np : namePatterns) { if (np.matches(propertyName)) { return np.getConfig();
