This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch OAK-11457 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 03c03e742434098983bd3456af45f081c9074d6d Author: Thomas Mueller <[email protected]> AuthorDate: Wed Feb 5 18:02:55 2025 +0100 OAK-11457 Tree store sometimes contains bundled properties --- .../flatfile/pipelined/PipelinedTreeStoreTask.java | 73 +++++++++++++++++++++- .../store/RemovePropertiesOfBundledNodesTest.java | 56 +++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedTreeStoreTask.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedTreeStoreTask.java index d8688eb4e2..9e855eda96 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedTreeStoreTask.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedTreeStoreTask.java @@ -34,12 +34,17 @@ import java.util.concurrent.Callable; import java.util.concurrent.TimeUnit; import org.apache.jackrabbit.guava.common.base.Stopwatch; +import org.apache.jackrabbit.oak.commons.json.JsopBuilder; +import org.apache.jackrabbit.oak.commons.json.JsopReader; +import org.apache.jackrabbit.oak.commons.json.JsopTokenizer; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.NodeStateEntryReader; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedSortBatchTask.Result; import org.apache.jackrabbit.oak.index.indexer.document.tree.TreeStore; import org.apache.jackrabbit.oak.index.indexer.document.tree.store.TreeSession; import org.apache.jackrabbit.oak.plugins.index.IndexingReporter; import org.apache.jackrabbit.oak.plugins.index.MetricsFormatter; import org.apache.jackrabbit.oak.plugins.index.MetricsUtils; +import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore; import org.apache.jackrabbit.oak.stats.StatisticsProvider; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -207,7 +212,9 @@ public class PipelinedTreeStoreTask implements Callable<PipelinedSortBatchTask.R int valueLength = buffer.getInt(); String value = new String(buffer.array(), buffer.arrayOffset() + buffer.position(), valueLength, StandardCharsets.UTF_8); textSize += entry.getPath().length() + value.length() + 2; - treeStore.putNode(entry.getPath(), value); + String path = entry.getPath(); + value = removePropertiesOfBundledNodes(path, value); + treeStore.putNode(path, value); } session.checkpoint(); unmergedRoots++; @@ -230,4 +237,68 @@ public class PipelinedTreeStoreTask implements Callable<PipelinedSortBatchTask.R } } + /** + * If there are any, remove properties of bundled nodes (jcr:content/...) from the JSON-encoded node. + * + * @param path the path + * @param value the JSON-encoded node + * @return the cleaned JSON + */ + public static String removePropertiesOfBundledNodes(String path, String value) { + if (value.indexOf("\"jcr:content/") < 0) { + return value; + } + // possibly the node contains a bundled property, but we are not sure + // try to de-serialize + NodeStateEntryReader nodeReader = new NodeStateEntryReader(new MemoryBlobStore()); + try { + // the following line will throw an exception if de-serialization fails + nodeReader.read(path + "|" + value); + // ok it did not: it was a false positive + return value; + } catch (Exception e) { + LOG.warn("Path {} value {}", path, value); + JsopReader reader = new JsopTokenizer(value); + JsopBuilder writer = new JsopBuilder(); + reader.read('{'); + writer.object(); + if (!reader.matches('}')) { + do { + String key = reader.readString(); + reader.read(':'); + // skip properties that contain "/" + boolean skip = key.indexOf('/') >= 0; + if (!skip) { + writer.key(key); + } + if (reader.matches('[')) { + if (!skip) { + writer.array(); + } + do { + String raw = reader.readRawValue(); + if (!skip) { + writer.encodedValue(raw); + } + } while (reader.matches(',')); + reader.read(']'); + if (!skip) { + writer.endArray(); + } + } else { + String raw = reader.readRawValue(); + if (!skip) { + writer.encodedValue(raw); + } + } + } while (reader.matches(',')); + } + reader.read('}'); + writer.endObject(); + String result = writer.toString(); + LOG.warn("Cleaned {} : {}", path, result); + return result; + } + } + } diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/tree/store/RemovePropertiesOfBundledNodesTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/tree/store/RemovePropertiesOfBundledNodesTest.java new file mode 100644 index 0000000000..a4c8ea4f92 --- /dev/null +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/tree/store/RemovePropertiesOfBundledNodesTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.tree.store; + +import static org.junit.Assert.assertEquals; + +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedTreeStoreTask; +import org.junit.Test; + +public class RemovePropertiesOfBundledNodesTest { + + @Test + public void cleanUp() { + // this is similar to the real-world case + verify("{\"jcr:created\":\"dat:2020-05-06T17:15:13.971Z\",\"jcr:primaryType\":\"nam:nt:file\",\"jcr:createdBy\":\"admin\",\"jcr:content/jcr:lastModified\":\"dat:2025-01-21T03:37:42.095Z\",\"jcr:content/jcr:lastModifiedBy\":\"test\"}", + "{\"jcr:created\":\"dat:2020-05-06T17:15:13.971Z\",\"jcr:primaryType\":\"nam:nt:file\",\"jcr:createdBy\":\"admin\"}"); + + // generic entries + verify("{}", "{}"); + verify("{\"c\":null,\"b\":\"x\",\"a\":123,\"d\":[1,2,null,\"x\"]}", + "{\"c\":null,\"b\":\"x\",\"a\":123,\"d\":[1,2,null,\"x\"]}"); + + // false positive + verify("{\"c\":\"jcr:content/that\"}", + "{\"c\":\"jcr:content/that\"}"); + + // generic entries that need cleaning + verify("{\"c\":null,\"jcr:content/this\":null,\"a\":123,\"jcr:content/that\":[1,2,null,\"x\"]}", + "{\"c\":null,\"a\":123}"); + verify("{\"c\":null,\"jcr:content/this\":null,\"a\":123,\"array\":[1,2,null,\"x\"]}", + "{\"c\":null,\"a\":123,\"array\":[1,2,null,\"x\"]}"); + + } + + static void verify(String input, String expected) { + String v2 = PipelinedTreeStoreTask.removePropertiesOfBundledNodes("/test", input); + assertEquals(expected, v2); + + } +}
