rishabhdaim commented on code in PR #1315: URL: https://github.com/apache/jackrabbit-oak/pull/1315#discussion_r1502951816
########## oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java: ########## @@ -942,21 +949,63 @@ public void collectGarbage(final NodeDocument doc, final GCPhases phases) { final UpdateOp op = new UpdateOp(requireNonNull(doc.getId()), false); op.equals(MODIFIED_IN_SECS, doc.getModified()); - collectDeletedProperties(doc, phases, op); - collectUnmergedBranchCommits(doc, phases, op, toModifiedMs); - collectOldRevisions(doc, phases, op); - // only add if there are changes for this doc - if (op.hasChanges()) { + // traversed state == state of node at doc.id based on head revision + NodeState traversedState = root; + for (String name : doc.getPath().elements()) { + traversedState = traversedState.getChildNode(name); + } + + if (isDeletedOrOrphanedNode(doc, traversedState, phases)) { + // if this is an orphaned node, all that is needed is its removal garbageDocsCount++; totalGarbageDocsCount++; - monitor.info("Collected [{}] garbage for doc [{}]", op.getChanges().size(), doc.getId()); - updateOpList.add(op); + monitor.info("Deleted orphaned or deleted doc [{}]", doc.getId()); + orphanOrDeletedRemovalList.add(doc.getId()); + } else { + collectDeletedProperties(doc, phases, op); + collectUnmergedBranchCommits(doc, phases, op, toModifiedMs); + collectOldRevisions(doc, phases, op); + // only add if there are changes for this doc + if (op.hasChanges()) { + garbageDocsCount++; + totalGarbageDocsCount++; + monitor.info("Collected [{}] garbage for doc [{}]", op.getChanges().size(), doc.getId()); + updateOpList.add(op); + } } if (log.isDebugEnabled()) { log.debug("UpdateOp for {} is {}", doc.getId(), op); } } + /** + * Check if the node represented by the given doc and traversedState is + * <i>orphaned</i>. A node is considered orphaned if it does not have a visible + * parent node. But from a GC point of view this also includes regular + * deletion cases that have not otherwise been deleted already (eg by DeletedDocsGC). + * + * @param doc + * @param traversedState + * @param phases + * @return true if the node is orphaned (and/or can be removed), false + * otherwise + */ + private boolean isDeletedOrOrphanedNode(NodeDocument doc, NodeState traversedState, + GCPhases phases) { Review Comment: ```suggestion private boolean isDeletedOrOrphanedNode(NodeState traversedState) { ``` Only traversedState is used. ########## oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java: ########## @@ -1255,28 +1305,40 @@ public void removeGarbage(final VersionGCStats stats) { } if (!isDetailedGCDryRun) { // only delete these in case it is not a dryRun - List<NodeDocument> oldDocs = ds.findAndUpdate(NODES, updateOpList); - int deletedProps = oldDocs.stream().filter(Objects::nonNull).mapToInt(d -> deletedPropsCountMap.getOrDefault(d.getId(), 0)).sum(); - int updatedDocs = (int) oldDocs.stream().filter(Objects::nonNull).count(); - stats.updatedDetailedGCDocsCount += updatedDocs; - stats.deletedPropsCount += deletedProps; - stats.deletedUnmergedBCCount += deletedUnmergedBCSet.size(); - - if (log.isDebugEnabled()) { - log.debug("Updated [{}] documents, deleted [{}] properties, deleted [{}] unmergedBranchCommits", - updatedDocs, deletedProps, deletedUnmergedBCSet.size()); + + if (!orphanOrDeletedRemovalList.isEmpty()) { + ds.remove(NODES, orphanOrDeletedRemovalList); + final int removedSize = orphanOrDeletedRemovalList.size(); + stats.updatedDetailedGCDocsCount += removedSize; + stats.deletedDocGCCount += removedSize; Review Comment: We need to save these stats as well in RevisionGCStats.documentsDeleted` or we could add other stats in detailedGcStats for orphan node deletion. ########## oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java: ########## @@ -1255,28 +1305,40 @@ public void removeGarbage(final VersionGCStats stats) { } if (!isDetailedGCDryRun) { // only delete these in case it is not a dryRun - List<NodeDocument> oldDocs = ds.findAndUpdate(NODES, updateOpList); - int deletedProps = oldDocs.stream().filter(Objects::nonNull).mapToInt(d -> deletedPropsCountMap.getOrDefault(d.getId(), 0)).sum(); - int updatedDocs = (int) oldDocs.stream().filter(Objects::nonNull).count(); - stats.updatedDetailedGCDocsCount += updatedDocs; - stats.deletedPropsCount += deletedProps; - stats.deletedUnmergedBCCount += deletedUnmergedBCSet.size(); - - if (log.isDebugEnabled()) { - log.debug("Updated [{}] documents, deleted [{}] properties, deleted [{}] unmergedBranchCommits", - updatedDocs, deletedProps, deletedUnmergedBCSet.size()); + + if (!orphanOrDeletedRemovalList.isEmpty()) { + ds.remove(NODES, orphanOrDeletedRemovalList); + final int removedSize = orphanOrDeletedRemovalList.size(); + stats.updatedDetailedGCDocsCount += removedSize; + stats.deletedDocGCCount += removedSize; Review Comment: Please add debug logging about the orphan nodes deletions. ########## oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java: ########## @@ -942,21 +949,63 @@ public void collectGarbage(final NodeDocument doc, final GCPhases phases) { final UpdateOp op = new UpdateOp(requireNonNull(doc.getId()), false); op.equals(MODIFIED_IN_SECS, doc.getModified()); - collectDeletedProperties(doc, phases, op); - collectUnmergedBranchCommits(doc, phases, op, toModifiedMs); - collectOldRevisions(doc, phases, op); - // only add if there are changes for this doc - if (op.hasChanges()) { + // traversed state == state of node at doc.id based on head revision + NodeState traversedState = root; + for (String name : doc.getPath().elements()) { + traversedState = traversedState.getChildNode(name); + } + + if (isDeletedOrOrphanedNode(doc, traversedState, phases)) { Review Comment: ```suggestion if (isDeletedOrOrphanedNode(traversedState)) { ``` ########## oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java: ########## @@ -1255,28 +1305,40 @@ public void removeGarbage(final VersionGCStats stats) { } if (!isDetailedGCDryRun) { // only delete these in case it is not a dryRun - List<NodeDocument> oldDocs = ds.findAndUpdate(NODES, updateOpList); - int deletedProps = oldDocs.stream().filter(Objects::nonNull).mapToInt(d -> deletedPropsCountMap.getOrDefault(d.getId(), 0)).sum(); - int updatedDocs = (int) oldDocs.stream().filter(Objects::nonNull).count(); - stats.updatedDetailedGCDocsCount += updatedDocs; - stats.deletedPropsCount += deletedProps; - stats.deletedUnmergedBCCount += deletedUnmergedBCSet.size(); - - if (log.isDebugEnabled()) { - log.debug("Updated [{}] documents, deleted [{}] properties, deleted [{}] unmergedBranchCommits", - updatedDocs, deletedProps, deletedUnmergedBCSet.size()); + + if (!orphanOrDeletedRemovalList.isEmpty()) { + ds.remove(NODES, orphanOrDeletedRemovalList); + final int removedSize = orphanOrDeletedRemovalList.size(); Review Comment: I would use the return value of `ds.remove` as removedSize not the actual list. There might be a case where db cannot delete all the passed documents. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org