This is an automated email from the ASF dual-hosted git repository.

thomasm pushed a commit to branch OAK-10549
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit b27cff21733989c2e483e28ccb4ad911f4ab44d4 Author: Thomas Mueller <[email protected]> AuthorDate: Wed Nov 15 16:55:27 2023 +0100 OAK-10549 Improve performance of facet count at scale (Lucene) --- .../plugins/index/lucene/LucenePropertyIndex.java | 53 ++++ .../oak/plugins/index/lucene/util/FacetHelper.java | 26 +- .../index/lucene/hybrid/ManyFacetsTest.java | 280 +++++++++++++++++++++ .../index/search/spi/query/FulltextIndex.java | 19 ++ .../index/search/spi/query/FulltextIndexTest.java | 13 + 5 files changed, 386 insertions(+), 5 deletions(-) diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java index 036b112fda..d9819d73bb 100644 --- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java +++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java @@ -209,6 +209,9 @@ public class LucenePropertyIndex extends FulltextIndex { public final static String CACHE_FACET_RESULTS_NAME = "oak.lucene.cacheFacetResults"; private final boolean CACHE_FACET_RESULTS = Boolean.parseBoolean(System.getProperty(CACHE_FACET_RESULTS_NAME, "true")); + public final static String EAGER_FACET_CACHE_FILL_NAME = "oak.lucene.cacheFacetEagerFill"; + private final static boolean EAGER_FACET_CACHE_FILL = + Boolean.parseBoolean(System.getProperty(EAGER_FACET_CACHE_FILL_NAME, "true")); private static boolean FLAG_CACHE_FACET_RESULTS_CHANGE = true; @@ -1644,11 +1647,39 @@ public class LucenePropertyIndex extends FulltextIndex { return cachedResults.get(cacheKey); } LOG.trace("columnName = {} facet Data not present in cache...", columnName); + if (EAGER_FACET_CACHE_FILL) { + fillFacetCache(numberOfFacets); + if (cachedResults.containsKey(cacheKey)) { + LOG.trace("columnName = {} now found"); + return cachedResults.get(cacheKey); + } + LOG.warn("Facet data for 
{} not found: read using query", cacheKey); + } List<Facet> result = getFacetsUncached(numberOfFacets, columnName); cachedResults.put(cacheKey, result); return result; } + private List<Facet> fillFacetCache(int numberOfFacets) throws IOException { + List<Facet> result = null; + LuceneIndexNode indexNode = index.acquireIndexNode(plan); + try { + IndexSearcher searcher = indexNode.getSearcher(); + Facets facets = FacetHelper.getFacets(searcher, query, plan, config); + if (facets != null) { + List<String> allColumnNames = FacetHelper.getFacetColumnNamesFromPlan(plan); + for (String column : allColumnNames) { + result = getFacetsUncached(facets, numberOfFacets, column); + String cc = column + "/" + numberOfFacets; + cachedResults.put(cc, result); + } + } + } finally { + indexNode.release(); + } + return result; + } + private List<Facet> getFacetsUncached(int numberOfFacets, String columnName) throws IOException { LuceneIndexNode indexNode = index.acquireIndexNode(plan); try { @@ -1677,6 +1708,28 @@ public class LucenePropertyIndex extends FulltextIndex { indexNode.release(); } } + + private List<Facet> getFacetsUncached(Facets facets, int numberOfFacets, String columnName) throws IOException { + String facetFieldName = FulltextIndex.parseFacetField(columnName); + try { + ImmutableList.Builder<Facet> res = new ImmutableList.Builder<>(); + FacetResult topChildren = facets.getTopChildren(numberOfFacets, facetFieldName); + if (topChildren == null) { + return null; + } + for (LabelAndValue lav : topChildren.labelValues) { + res.add(new Facet( + lav.label, lav.value.intValue() + )); + } + return res.build(); + } catch (IllegalArgumentException iae) { + LOG.debug(iae.getMessage(), iae); + LOG.warn("facets for {} not yet indexed: " + iae, facetFieldName); + return null; + } + } + } static class LuceneFacetProvider implements FacetProvider { diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/FacetHelper.java 
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/FacetHelper.java index 13dabce9fb..37eb574d2e 100644 --- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/FacetHelper.java +++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/FacetHelper.java @@ -19,14 +19,17 @@ package org.apache.jackrabbit.oak.plugins.index.lucene.util; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition.SecureFacetConfiguration; -import org.apache.jackrabbit.oak.spi.query.QueryConstants; +import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex; import org.apache.jackrabbit.oak.spi.query.QueryIndex; +import org.apache.jackrabbit.oak.spi.query.QueryIndex.IndexPlan; import org.apache.jackrabbit.oak.spi.state.NodeBuilder; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; @@ -60,6 +63,23 @@ public class FacetHelper { return new NodeStateFacetsConfig(definition); } + /** + * Get the column names of all the facets from the index plan, if any. + * + * @param plan the plan + * @return a list (possibly empty) + */ + public static List<String> getFacetColumnNamesFromPlan(IndexPlan plan) { + @SuppressWarnings("unchecked") + List<String> facetFields = (List<String>) plan.getAttribute(ATTR_FACET_FIELDS); + if (facetFields == null) { + return Collections.emptyList(); + } + return facetFields.stream().map( + FulltextIndex::convertFacetFieldNameToColumnName). 
+ collect(Collectors.toList()); + } + public static Facets getFacets(IndexSearcher searcher, Query query, QueryIndex.IndexPlan plan, SecureFacetConfiguration secureFacetConfiguration) throws IOException { Facets facets = null; @@ -104,10 +124,6 @@ public class FacetHelper { return facets; } - public static String parseFacetField(String columnName) { - return columnName.substring(QueryConstants.REP_FACET.length() + 1, columnName.length() - 1); - } - private static final Facets NULL_FACETS = new Facets() { @Override public FacetResult getTopChildren(int topN, String dim, String... path) { diff --git a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/hybrid/ManyFacetsTest.java b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/hybrid/ManyFacetsTest.java new file mode 100644 index 0000000000..d98bd40d2b --- /dev/null +++ b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/hybrid/ManyFacetsTest.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jackrabbit.oak.plugins.index.lucene.hybrid; + +import static org.apache.jackrabbit.guava.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_FACETS; +import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.STATISTICAL_FACET_SAMPLE_SIZE_DEFAULT; +import static org.apache.jackrabbit.oak.spi.mount.Mounts.defaultMountInfoProvider; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.IOException; +import java.util.Properties; +import java.util.Random; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; + +import javax.jcr.GuestCredentials; +import javax.jcr.Repository; +import javax.jcr.RepositoryException; +import javax.jcr.Session; +import javax.jcr.query.Query; +import javax.jcr.query.QueryManager; +import javax.jcr.query.QueryResult; +import javax.jcr.query.Row; +import javax.jcr.query.RowIterator; + +import org.apache.jackrabbit.oak.InitialContent; +import org.apache.jackrabbit.oak.Oak; +import org.apache.jackrabbit.oak.api.ContentRepository; +import org.apache.jackrabbit.oak.api.Tree; +import org.apache.jackrabbit.oak.api.Type; +import org.apache.jackrabbit.oak.commons.PathUtils; +import org.apache.jackrabbit.oak.commons.concurrent.ExecutorCloser; +import org.apache.jackrabbit.oak.commons.json.JsonObject; +import org.apache.jackrabbit.oak.jcr.Jcr; +import org.apache.jackrabbit.oak.plugins.index.AsyncIndexUpdate; +import org.apache.jackrabbit.oak.plugins.index.counter.NodeCounterEditorProvider; +import org.apache.jackrabbit.oak.plugins.index.lucene.IndexCopier; +import org.apache.jackrabbit.oak.plugins.index.lucene.IndexTracker; +import 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorProvider; +import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexProvider; +import org.apache.jackrabbit.oak.plugins.index.lucene.LucenePropertyIndex; +import org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil; +import org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.OptionalEditorProvider; +import org.apache.jackrabbit.oak.plugins.index.lucene.reader.DefaultIndexReaderFactory; +import org.apache.jackrabbit.oak.plugins.index.lucene.reader.LuceneIndexReaderFactory; +import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneIndexDefinitionBuilder; +import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider; +import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider; +import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants; +import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder.PropertyRule; +import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore; +import org.apache.jackrabbit.oak.query.AbstractQueryTest; +import org.apache.jackrabbit.oak.spi.commit.Observer; +import org.apache.jackrabbit.oak.spi.mount.MountInfoProvider; +import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider; +import org.apache.jackrabbit.oak.spi.security.OpenSecurityProvider; +import org.apache.jackrabbit.oak.spi.state.NodeStore; +import org.apache.jackrabbit.oak.spi.whiteboard.Whiteboard; +import org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils; +import org.apache.jackrabbit.oak.stats.Clock; +import org.apache.jackrabbit.oak.stats.StatisticsProvider; +import org.jetbrains.annotations.Nullable; +import org.junit.After; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class ManyFacetsTest extends AbstractQueryTest { + + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(new File("target")); + + private static final int 
NUM_LABELS = 4; + private static final int NUM_LEAF_NODES = STATISTICAL_FACET_SAMPLE_SIZE_DEFAULT; + private static final String FACET_PROP = "facets"; + private static final long REFRESH_DELTA = TimeUnit.SECONDS.toMillis(1); + + private static final int FACET_COUNT = 200; + + private ExecutorService executorService = Executors.newFixedThreadPool(2); + private OptionalEditorProvider optionalEditorProvider = new OptionalEditorProvider(); + private NRTIndexFactory nrtIndexFactory; + private LuceneIndexProvider luceneIndexProvider; + private NodeStore nodeStore; + private DocumentQueue queue; + private Clock clock = new Clock.Virtual(); + private Whiteboard wb; + private QueryManager qm; + private Repository jcrRepo; + private Jcr jcr; + private Oak oak; + // backup original system properties i.e. before test started + private final Properties backupProperties = (Properties) System.getProperties().clone(); + + @After + public void tearDown() throws IOException { + luceneIndexProvider.close(); + new ExecutorCloser(executorService).close(); + nrtIndexFactory.close(); + // restore original system properties i.e. 
before test started + System.setProperties(backupProperties); + } + + @Override + protected ContentRepository createRepository() { + IndexCopier copier; + try { + copier = new IndexCopier(executorService, temporaryFolder.getRoot()); + } catch (IOException e) { + throw new RuntimeException(e); + } + MountInfoProvider mip = defaultMountInfoProvider(); + nrtIndexFactory = new NRTIndexFactory(copier, clock, TimeUnit.MILLISECONDS.toSeconds(REFRESH_DELTA), StatisticsProvider.NOOP); + nrtIndexFactory.setAssertAllResourcesClosed(true); + LuceneIndexReaderFactory indexReaderFactory = new DefaultIndexReaderFactory(mip, copier); + IndexTracker tracker = new IndexTracker(indexReaderFactory, nrtIndexFactory); + luceneIndexProvider = new LuceneIndexProvider(tracker); + queue = new DocumentQueue(100, tracker, newDirectExecutorService()); + LuceneIndexEditorProvider editorProvider = new LuceneIndexEditorProvider(copier, + tracker, + null, + null, + mip); + editorProvider.setIndexingQueue(queue); + LocalIndexObserver localIndexObserver = new LocalIndexObserver(queue, StatisticsProvider.NOOP); + nodeStore = new MemoryNodeStore(); + oak = new Oak(nodeStore) + .with(new InitialContent()) + .with(new OpenSecurityProvider()) + .with((QueryIndexProvider) luceneIndexProvider) + .with((Observer) luceneIndexProvider) + .with(localIndexObserver) + .with(editorProvider) + .with(new PropertyIndexEditorProvider()) + .with(new NodeTypeIndexProvider()) + .with(optionalEditorProvider) + .with(new NodeCounterEditorProvider()) + //Effectively disable async indexing auto run + //such that we can control run timing as per test requirement + .withAsyncIndexing("async", TimeUnit.DAYS.toSeconds(1)); + + wb = oak.getWhiteboard(); + ContentRepository repo = oak.createContentRepository(); + return repo; + } + + private void createSmallDataset(int k) throws RepositoryException { + Random random = new Random(42); + Tree par = createPath("/parent" + k); + par.setProperty("foo", "bar"); + for (int i = 0; i < 
NUM_LABELS * 2; i++) { + Tree subPar = par.addChild("par" + i); + for (int j = 0; j < NUM_LEAF_NODES / (2 * NUM_LABELS); j++) { + Tree child = subPar.addChild("c" + j); + child.setProperty("cons", "val"); + for (int f = 0; f < FACET_COUNT; f++) { + int labelNum = random.nextInt(NUM_LABELS); + child.setProperty("foo" + f, "foo" + f + "x" + labelNum); + } + } + } + } + + private Tree createPath(String path) { + Tree base = root.getTree("/"); + for (String e : PathUtils.elements(path)) { + base = base.addChild(e); + } + return base; + } + + private void runAsyncIndex() { + AsyncIndexUpdate async = (AsyncIndexUpdate) WhiteboardUtils.getService(wb, Runnable.class, new Predicate<Runnable>() { + @Override + public boolean test(@Nullable Runnable input) { + return input instanceof AsyncIndexUpdate; + } + }); + assertNotNull(async); + async.run(); + if (async.isFailing()) { + fail("AsyncIndexUpdate failed"); + } + root.refresh(); + } + + @Test + public void facet() throws Exception { + // Explicitly setting following configs to run DelayedLuceneFacetProvider and a thread sleep of 50 ms in refresh readers. Refer: OAK-8898 + System.setProperty(LucenePropertyIndex.OLD_FACET_PROVIDER_CONFIG_NAME, "false"); + // The variable is static final so once set it remains same for all tests and which will lead to slow execution + // of other tests as this add a sleep of specified milliseconds in refresh reader method in LuceneIndexNodeManager. 
+ // System.setProperty(LuceneIndexNodeManager.OLD_FACET_PROVIDER_TEST_FAILURE_SLEEP_INSTRUMENT_NAME, "40"); + Thread.currentThread().setName("main"); + String idxName = "hybridtest"; + Tree idx = createIndex(root.getTree("/"), idxName); + TestUtil.enableIndexingMode(idx, FulltextIndexConstants.IndexingMode.NRT); + setTraversalEnabled(false); + root.commit(); + jcr = new Jcr(oak); + jcrRepo = jcr.createRepository(); + createSmallDataset(0); + clock.waitUntil(clock.getTime() + REFRESH_DELTA + 1); + root.commit(); + runAsyncIndex(); + createSmallDataset(2); + clock.waitUntil(clock.getTime() + REFRESH_DELTA + 1); + root.commit(); + Session anonSession = jcrRepo.login(new GuestCredentials()); + qm = anonSession.getWorkspace().getQueryManager(); + String facetList = ""; + for (int i = 0; i < FACET_COUNT; i++) { + if (i > 0) { + facetList += ", "; + } + facetList += "[rep:facet(foo" + i + ")]"; + } + String queryString = "SELECT " + facetList + + " FROM [nt:base] WHERE [cons] = 'val'"; + Query q = qm.createQuery(queryString, SQL2); + QueryResult qr = q.execute(); + try { + RowIterator it = qr.getRows(); + assertTrue(it.hasNext()); + while (it.hasNext()) { + Row r = it.nextRow(); + for (int i = 0; i < qr.getColumnNames().length; i++) { + String columnName = qr.getColumnNames()[i]; + String v = r.getValue(columnName).getString(); + JsonObject json = JsonObject.fromJson(v, true); + for (int j = 0; j < NUM_LABELS; j++) { + String n = json.getProperties().get("foo" + i + "x" + j); + assertTrue(n != null); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + throw e; + } + } + + private Tree createIndex(Tree index, String name) throws RepositoryException { + LuceneIndexDefinitionBuilder idxBuilder = new LuceneIndexDefinitionBuilder(); + PropertyRule pr = idxBuilder.noAsync() + .indexRule("nt:base") + .property("cons").propertyIndex(); + for (int i = 0; i < FACET_COUNT; i++) { + pr.property("foo" + i).propertyIndex().getBuilderTree().setProperty(PROP_FACETS, true); + 
} + Tree facetConfig = idxBuilder.getBuilderTree().addChild(FACET_PROP); + facetConfig.setProperty("jcr:primaryType", "nt:unstructured", Type.NAME); + facetConfig.setProperty("secure", "statistical"); + facetConfig.setProperty("topChildren", "100"); + Tree idxTree = index.getChild("oak:index").addChild(name); + idxBuilder.build(idxTree); + return idxTree; + } + +} diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java index 38dd82d953..4b469f56b5 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java @@ -615,7 +615,26 @@ public abstract class FulltextIndex implements AdvancedQueryIndex, QueryIndex, N } } + /** + * Get the facet name from a column name. + * + * This method silently assumes(!) that the column name starts with "rep:facet(" + * and ends with ")". + * + * @param columnName the column name, e.g. "rep:facet(abc)" + * @return the facet name, e.g. "abc" + */ public static String parseFacetField(String columnName) { return columnName.substring(QueryConstants.REP_FACET.length() + 1, columnName.length() - 1); } + + /** + * Convert the facet name to a column name. + * + * @param facetFieldName the facet field name, e.g. "abc" + * @return the column name, e.g. 
"rep:facet(abc)" + */ + public static String convertFacetFieldNameToColumnName(String facetFieldName) { + return QueryConstants.REP_FACET + "(" + facetFieldName + ")"; + } } diff --git a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndexTest.java b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndexTest.java index 316081b597..17dc3995a5 100644 --- a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndexTest.java +++ b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndexTest.java @@ -59,6 +59,19 @@ public class FulltextIndexTest { assertEquals("jcr:primaryType", field); } + @Test + public void testConvertParseFacetField() { + assertEquals("rep:facet(text)", + FulltextIndex.convertFacetFieldNameToColumnName( + "text")); + assertEquals("rep:facet(jcr:title)", + FulltextIndex.convertFacetFieldNameToColumnName( + "jcr:title")); + assertEquals("rep:facet(jcr:primaryType)", + FulltextIndex.convertFacetFieldNameToColumnName( + "jcr:primaryType")); + } + /** * Test that we can read the rows first, and then read the data from the rows. */
