thomasmueller commented on code in PR #2689:
URL: https://github.com/apache/jackrabbit-oak/pull/2689#discussion_r2773681515
##########
oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/diff/DiffIndexMerger.java:
##########
@@ -0,0 +1,833 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.diff;
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import org.apache.jackrabbit.oak.commons.StringUtils;
+import org.apache.jackrabbit.oak.commons.json.JsonObject;
+import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
+import org.apache.jackrabbit.oak.commons.json.JsopTokenizer;
+import org.apache.jackrabbit.oak.json.Base64BlobSerializer;
+import org.apache.jackrabbit.oak.json.JsonSerializer;
+import org.apache.jackrabbit.oak.plugins.index.IndexName;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Index definition merge utility that uses the "diff" mode.
+ */
+public class DiffIndexMerger {
+
+    private final static Logger LOG = LoggerFactory.getLogger(DiffIndexMerger.class);
+
+    public final static String DIFF_INDEX = "diff.index";
+    public final static String DIFF_INDEX_OPTIMIZER = "diff.index.optimizer";
+
+    private final static String MERGE_INFO = "This index was auto-merged. See also https://oak-indexing.github.io/oakTools/simplified.html";
+
+    // the list of unsupported included paths, e.g. "/apps,/libs"
+    // by default all paths are supported
+    private final static String[] UNSUPPORTED_INCLUDED_PATHS = System.getProperty("oak.diffIndex.unsupportedPaths", "").split(",");
+
+    // in case a custom index is removed, whether a dummy index is created
+    private final static boolean DELETE_CREATES_DUMMY = Boolean.getBoolean("oak.diffIndex.deleteCreatesDummy");
+
+    // in case a customization was removed, create a copy of the OOTB index
+    private final static boolean DELETE_COPIES_OOTB = Boolean.getBoolean("oak.diffIndex.deleteCopiesOOTB");
+
+    // whether to log at info level
+    private final static boolean LOG_AT_INFO_LEVEL = Boolean.getBoolean("oak.diffIndex.logAtInfoLevel");
+
+    private final String[] unsupportedIncludedPaths;
+    private final boolean deleteCreatesDummyIndex;
+    private final boolean deleteCopiesOutOfTheBoxIndex;
+    private final boolean logAtInfoLevel;
+
+    static final DiffIndexMerger INSTANCE = new DiffIndexMerger(UNSUPPORTED_INCLUDED_PATHS,
+            DELETE_CREATES_DUMMY, DELETE_COPIES_OOTB, LOG_AT_INFO_LEVEL);
+
+    public static DiffIndexMerger instance() {
+        return INSTANCE;
+    }
+
+    DiffIndexMerger(String[] unsupportedIncludedPaths,
+            boolean deleteCreatesDummyIndex, boolean deleteCopiesOutOfTheBoxIndex,
+            boolean logAtInfoLevel) {
+        this.unsupportedIncludedPaths = unsupportedIncludedPaths;
+        this.deleteCreatesDummyIndex = deleteCreatesDummyIndex;
+        this.deleteCopiesOutOfTheBoxIndex = deleteCopiesOutOfTheBoxIndex;
+        this.logAtInfoLevel = logAtInfoLevel;
+    }
+
+    /**
+     * If there is a diff index, that is an index with prefix "diff.", then try to merge it.
+     *
+     * @param newImageLuceneDefinitions
+     *            the new indexes
+     *            (input and output)
+     * @param repositoryDefinitions
+     *            the indexes in the writable repository
+     *            (input)
+     * @param repositoryNodeStore
+     */
+    public void merge(JsonObject newImageLuceneDefinitions, JsonObject repositoryDefinitions, NodeStore repositoryNodeStore) {
+        // combine all definitions into one object
+        JsonObject combined = new JsonObject();
+
+        // index definitions in the repository
+        combined.getChildren().putAll(repositoryDefinitions.getChildren());
+
+        // read the diff.index.optimizer explicitly,
+        // because it's not a regular index definition,
+        // and so not in the repositoryDefinitions
+        if (repositoryNodeStore != null) {
+            Map<String, JsonObject> diffInRepo = readDiffIndex(repositoryNodeStore, DIFF_INDEX_OPTIMIZER);
+            combined.getChildren().putAll(diffInRepo);
+        }
+
+        // overwrite with the provided definitions (if any)
+        combined.getChildren().putAll(newImageLuceneDefinitions.getChildren());
+
+        // check if there is a "diff.index" or "diff.index.optimizer"
+        boolean found = combined.getChildren().containsKey("/oak:index/" + DIFF_INDEX)
+                || combined.getChildren().containsKey("/oak:index/" + DIFF_INDEX_OPTIMIZER);
+        if (!found) {
+            // early exit, so that the risk of merging the PR
+            // is very small for customers that do not use this
+            log("No 'diff.index' definition");
+            return;
+        }
+        mergeDiff(newImageLuceneDefinitions, combined);
+    }
+
+    /**
+     * If there is a diff index (hardcoded node "/oak:index/diff.index" or
+     * "/oak:index/diff.index.optimizer"), then iterate over all entries and create new
+     * (merged) versions if needed.
+     *
+     * @param newImageLuceneDefinitions
+     *            the new Lucene definitions
+     *            (input + output)
+     * @param combined
+     *            the definitions in the repository,
+     *            including the one in the customer repo and new ones
+     *            (input)
+     * @return whether a new version of an index was added
+     */
+    boolean mergeDiff(JsonObject newImageLuceneDefinitions, JsonObject combined) {
+        // iterate again, this time process
+
+        // collect the diff index(es)
+        HashMap<String, JsonObject> toProcess = new HashMap<>();
+        tryExtractDiffIndex(combined, "/oak:index/" + DIFF_INDEX, toProcess);
+        tryExtractDiffIndex(combined, "/oak:index/" + DIFF_INDEX_OPTIMIZER, toProcess);
+        // if the diff index exists, but doesn't contain some of the previous indexes
+        // (indexes with mergeInfo), then we need to disable those (using /dummy includedPath)
+        extractExistingMergedIndexes(combined, toProcess);
+        if (toProcess.isEmpty()) {
+            log("No diff index definitions found.");
+            return false;
+        }
+        boolean hasChanges = false;
+        for (Entry<String, JsonObject> e : toProcess.entrySet()) {
+            String key = e.getKey();
+            JsonObject value = e.getValue();
+            if (key.startsWith("/oak:index/")) {
+                LOG.warn("The key should contain just the index name, without the '/oak:index' prefix for key {}", key);
+                key = key.substring("/oak:index/".length());
+            }
+            log("Processing {}", key);
+            hasChanges |= processMerge(key, value, newImageLuceneDefinitions, combined);
+        }
+        return hasChanges;
+    }
+
+    /**
+     * Extract a "diff.index" from the set of index definitions (if found), and if
+     * found, store the nested entries in the target map, merging them with previous
+     * entries if found.
+     *
+     * The diff.index may either have a file (a "jcr:content" child node with a
+     * "jcr:data" property), or a "diff" JSON object. For customers (in the git
+     * repository), the file is much easier to construct, but when running the
+     * indexing job, the nested JSON is much easier.
+     *
+     * @param indexDefs the set of index definitions (may be empty)
+     * @param name the name of the diff.index (either diff.index or
+     *            diff.index.optimizer)
+     * @param target the target map of diff.index definitions
+     * @return the error message if parsing the JSON file failed, or null
+     */
+    public static String tryExtractDiffIndex(JsonObject indexDefs, String name, HashMap<String, JsonObject> target) {
+        JsonObject diffIndex = indexDefs.getChildren().get(name);
+        if (diffIndex == null) {
+            return null;
+        }
+        // extract either the file, or the nested json
+        JsonObject file = diffIndex.getChildren().get("diff.json");
+        JsonObject diff;
+        if (file != null) {
+            // file
+            JsonObject jcrContent = file.getChildren().get("jcr:content");
+            if (jcrContent == null) {
+                String message = "jcr:content child node is missing in diff.json";
+                LOG.warn(message);
+                return message;
+            }
+            String jcrData = JsonNodeBuilder.oakStringValue(jcrContent, "jcr:data");
+            try {
+                diff = JsonObject.fromJson(jcrData, true);
+            } catch (Exception e) {
+                LOG.warn("Illegal Json, ignoring: {}", jcrData, e);
+                String message = "Illegal Json, ignoring: " + e.getMessage();
+                return message;
+            }
+        } else {
+            // nested json
+            diff = diffIndex.getChildren().get("diff");
+        }
+        // store, if not empty
+        if (diff != null) {
+            for (Entry<String, JsonObject> e : diff.getChildren().entrySet()) {
+                String key = e.getKey();
+                target.put(key, mergeDiffs(target.get(key), e.getValue()));
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Extract the indexes with a "mergeInfo" property and store them in the target
+     * object. This is needed so that indexes that were removed from the diff.index
+     * are detected (a new version is needed in this case with includedPaths
+     * "/dummy").
+     *
+     * @param indexDefs the index definitions in the repository
+     * @param target the target map of "diff.index" definitions. For each entry
+     *            found, an empty object is added
+     */
+    private static void extractExistingMergedIndexes(JsonObject indexDefs, HashMap<String, JsonObject> target) {
+        for (Entry<String, JsonObject> e : indexDefs.getChildren().entrySet()) {
+            String key = e.getKey();
+            JsonObject value = e.getValue();
+            if (key.indexOf("-custom-") < 0 || !value.getProperties().containsKey("mergeInfo")) {
+                continue;
+            }
+            String baseName = IndexName.parse(key.substring("/oak:index/".length())).getBaseName();
+            if (!target.containsKey(baseName)) {
+                // if there is no entry yet for this key,
+                // add a new empty object
+                target.put(baseName, new JsonObject());
+            }
+        }
+    }
+
+    /**
+     * Merge diff from "diff.index" and "diff.index.optimizer".
+     * The customer can define a diff (stored in "diff.index")
+     * and someone else (or the optimizer) can define one (stored in "diff.index.optimizer").
+     *
+     * @param a the first diff
+     * @param b the second diff (overwrites entries in a)
+     * @return the merged entry
+     */
+    public static JsonObject mergeDiffs(JsonObject a, JsonObject b) {
+        if (a == null) {
+            return b;
+        } else if (b == null) {
+            return a;
+        }
+        JsonObject result = JsonObject.fromJson(a.toString(), true);
+        result.getProperties().putAll(b.getProperties());
+        HashSet<String> both = new HashSet<>(a.getChildren().keySet());
+        both.addAll(b.getChildren().keySet());
+        for (String k : both) {
+            result.getChildren().put(k, mergeDiffs(a.getChildren().get(k), b.getChildren().get(k)));
+        }
+        return result;
+    }
+
+    /**
+     * Merge using the diff definition.
+     *
+     * If the latest customized index already matches, then
+     * newImageLuceneDefinitions will remain as is. Otherwise, a new customized
+     * index is added, with a "mergeInfo" property.
+     *
+     * Existing properties are never changed; only new properties/children are
+     * added.
+     *
+     * @param indexName the name, e.g. "damAssetLucene"
+     * @param indexDiff the diff with the new properties
+     * @param newImageLuceneDefinitions the new Lucene definitions (input + output)
+     * @param combined the definitions in the repository, including
+     *            the one in the customer repo and new ones
+     *            (input)
+     * @return whether a new version of an index was added
+     */
+    public boolean processMerge(String indexName, JsonObject indexDiff, JsonObject newImageLuceneDefinitions, JsonObject combined) {
+        // extract the latest product index (e.g. damAssetLucene-12)
+        // and customized index (e.g. damAssetLucene-12-custom-3) - if any
+        IndexName latestProduct = null;
+        String latestProductKey = null;
+        IndexName latestCustomized = null;
+        String latestCustomizedKey = null;
+        String prefix = "/oak:index/";
+        for (String key : combined.getChildren().keySet()) {
+            IndexName name = IndexName.parse(key.substring(prefix.length()));
+            if (!name.isVersioned()) {
+                log("Ignoring unversioned index {}", name);
+                continue;
+            }
+            if (!name.getBaseName().equals(indexName)) {
+                continue;
+            }
+            boolean isCustom = key.indexOf("-custom-") >= 0;
+            if (isCustom) {
+                if (latestCustomized == null ||
+                        name.compareTo(latestCustomized) > 0) {
+                    latestCustomized = name;
+                    latestCustomizedKey = key;
+                }
+            } else {
+                if (latestProduct == null ||
+                        name.compareTo(latestProduct) > 0) {
+                    latestProduct = name;
+                    latestProductKey = key;
+                }
+            }
+        }
+        log("Latest product: {}", latestProductKey);
+        log("Latest customized: {}", latestCustomizedKey);
+        if (latestProduct == null) {
+            if (indexName.indexOf('.') >= 0) {
+                // a fully custom index needs to contain a dot
+                log("Fully custom index {}", indexName);
+            } else {
+                log("No product version for {}", indexName);
+                return false;
+            }
+        }
+        JsonObject latestProductIndex = combined.getChildren().get(latestProductKey);
+        String[] includedPaths;
+        if (latestProductIndex == null) {
+            if (indexDiff.getProperties().isEmpty() && indexDiff.getChildren().isEmpty()) {
+                // there is no customization (any more), which means a dummy index may be needed
+                log("No customization for {}", indexName);
+            } else {
+                includedPaths = JsonNodeBuilder.oakStringArrayValue(indexDiff, "includedPaths");
+                if (includesUnsupportedPaths(includedPaths)) {
+                    LOG.warn("New custom index {} is not supported because it contains an unsupported path ({})",
+                            indexName, Arrays.toString(unsupportedIncludedPaths));
+                    return false;
+                }
+            }
+        } else {
+            includedPaths = JsonNodeBuilder.oakStringArrayValue(latestProductIndex, "includedPaths");
+            if (includesUnsupportedPaths(includedPaths)) {
+                LOG.warn("Customizing index {} is not supported because it contains an unsupported path ({})",
+                        latestProductKey, Arrays.toString(unsupportedIncludedPaths));
+                return false;
+            }
+        }
+
+        // merge
+        JsonObject merged = null;
+        if (indexDiff == null) {

Review Comment:
   There is some code executed in this case (latestCustomized == null...) so that would require changing `processMerge`, which I would avoid.
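
For readers following the `mergeDiffs` method shown in the diff above, here is a minimal usage sketch of its documented semantics (properties from the second diff overwrite the first, child objects are merged recursively). This is not code from the PR; the class name `MergeDiffsExample` and the JSON values are invented for illustration, and only methods visible in the diff (`JsonObject.fromJson`, `DiffIndexMerger.mergeDiffs`) are used.

```java
import org.apache.jackrabbit.oak.commons.json.JsonObject;
import org.apache.jackrabbit.oak.plugins.index.diff.DiffIndexMerger;

// Illustrative sketch only (not part of the PR): demonstrates the merge
// semantics described in the mergeDiffs Javadoc above.
public class MergeDiffsExample {
    public static void main(String[] args) {
        // a diff defined by the customer (as stored under "diff.index")
        JsonObject customerDiff = JsonObject.fromJson(
                "{\"costPerEntry\": \"2.0\", \"tags\": {\"x\": \"1\"}}", true);
        // a diff defined by the optimizer (as stored under "diff.index.optimizer")
        JsonObject optimizerDiff = JsonObject.fromJson(
                "{\"costPerEntry\": \"3.0\", \"tags\": {\"y\": \"2\"}}", true);
        // properties of the second argument win; the "tags" child is merged
        // recursively, so the result contains both "x" and "y"
        JsonObject merged = DiffIndexMerger.mergeDiffs(customerDiff, optimizerDiff);
        System.out.println(merged);
    }
}
```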
