This is an automated email from the ASF dual-hosted git repository.

jsedding pushed a commit to branch jsedding/OAK-12068-remove-expensive-uuid-deduplication
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit bc7798a4d0d5caeca3e930f2ee298eb7c7ed4b9d Author: Julian Sedding <[email protected]> AuthorDate: Wed Jan 21 12:15:13 2026 +0100 OAK-12068 - segment graph UUID deduplication (OAK-12005) can be too inefficient - remove UUID deduplication in segment graphs - avoid over-allocation of HashMaps of known bounded size --- .../jackrabbit/oak/segment/file/tar/TarFiles.java | 30 ++++++++-------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/tar/TarFiles.java b/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/tar/TarFiles.java index 283d4c9ded..027da0e8f3 100644 --- a/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/tar/TarFiles.java +++ b/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/tar/TarFiles.java @@ -42,7 +42,6 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Predicate; -import java.util.function.UnaryOperator; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -338,7 +337,7 @@ public class TarFiles implements Closeable { private final long maxFileSize; - private SegmentArchiveManager archiveManager; + private final SegmentArchiveManager archiveManager; /** * Guards access to the {@link #readers} and {@link #writer} references. 
@@ -907,22 +906,15 @@ public class TarFiles implements Closeable { for (TarReader reader : iterable(head)) { if (fileName.equals(reader.getFileName())) { - Map<String, Set<UUID>> indices = getIndices(); - Map<UUID, UUID> uuidDeduplicationMap = indices.values().stream() - .flatMap(Set::stream) - .collect(Collectors.toUnmodifiableMap(Function.identity(), Function.identity())); - UnaryOperator<UUID> uuidDeduplicator = uuid -> uuidDeduplicationMap.getOrDefault(uuid, uuid); - Set<UUID> uuids = indices.get(reader.getFileName()); - Map<UUID, Set<UUID>> edges = reader.getGraph().getEdges(); - // Create a map covering all UUIDs contained in the file's index and deduplicate - // all UUID instances based on the UUIDs already present in _all_ archives' indices. - // This helps to keep the memory overhead during the lifetime of graph-maps to a minimum. - return uuids.stream().collect(Collectors.toUnmodifiableMap( - uuidDeduplicator, - uuid -> edges.getOrDefault(uuid, emptySet()).stream() - .map(uuidDeduplicator) - .collect(Collectors.toUnmodifiableSet()))); - + SegmentGraph graph = reader.getGraph(); + Set<UUID> uuids = reader.getUUIDs(); + return uuids.stream() + .collect(Collectors.toMap( + Function.identity(), + graph::getEdges, + (a, b) -> { a.addAll(b); return a; }, + () -> new HashMap<>(Math.toIntExact(uuids.size()), 1.0f) + )); } } return emptyMap(); @@ -938,7 +930,7 @@ public class TarFiles implements Closeable { lock.readLock().unlock(); } - Map<String, Set<UUID>> index = new HashMap<>(); + Map<String, Set<UUID>> index = new HashMap<>(Math.toIntExact(getSize(head)), 1.0f); for (TarReader reader : iterable(head)) { index.put(reader.getFileName(), reader.getUUIDs()); }
