stefan-egli commented on code in PR #993:
URL: https://github.com/apache/jackrabbit-oak/pull/993#discussion_r1285753432


##########
oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java:
##########
@@ -605,8 +814,161 @@ private void collectDeletedDocuments(GCPhases phases,
                     gc.updateResurrectedDocuments(phases.stats);
                     phases.stop(GCPhase.UPDATING);
                 }
+            }
+        }
+    }
+
+    private class DetailedGC implements Closeable {
+
+        private final RevisionVector headRevision;
+        private final GCMonitor monitor;
+        private final AtomicBoolean cancel;
+        private final Stopwatch timer;
+        private final List<UpdateOp> updateOpList;
+
+        private final Map<String, Integer> deletedPropsCountMap;
+        private int garbageDocsCount;
+        private int totalGarbageDocsCount;
+
+        public DetailedGC(@NotNull RevisionVector headRevision, @NotNull 
GCMonitor monitor, @NotNull AtomicBoolean cancel) {
+            this.headRevision = requireNonNull(headRevision);
+            this.monitor = monitor;
+            this.cancel = cancel;
+            this.updateOpList = new ArrayList<>();
+            this.deletedPropsCountMap = new HashMap<>();
+            this.timer = createUnstarted();
+        }
+
+        public void collectGarbage(final NodeDocument doc, final GCPhases 
phases) {
+
+            monitor.info("Collecting Detailed Garbage for doc [{}]", 
doc.getId());
+
+            final UpdateOp op = new UpdateOp(requireNonNull(doc.getId()), 
false);
+            op.equals(MODIFIED_IN_SECS, doc.getModified());
+
+            collectDeletedProperties(doc, phases, op);
+            collectUnmergedBranchCommitDocument(doc, phases, op);
+            collectOldRevisions(doc, phases, op);
+            // only add if there are changes for this doc
+            if (op.hasChanges()) {
+                garbageDocsCount++;
+                totalGarbageDocsCount++;
+                monitor.info("Collected [{}] garbage for doc [{}]", 
op.getChanges().size(), doc.getId());
+                updateOpList.add(op);
+            }
+        }
+
+        private boolean hasGarbage() {
+            return garbageDocsCount > 0;
+        }
+
+        private void collectUnmergedBranchCommitDocument(final NodeDocument 
doc, final GCPhases phases, final UpdateOp updateOp) {
+            if (phases.start(GCPhase.DETAILED_GC_COLLECT_UNMERGED_BC)){
+                // TODO add unmerged BC collection logic
+                phases.stop(GCPhase.DETAILED_GC_COLLECT_UNMERGED_BC);
+            }
+
+        }
+
+        private void collectDeletedProperties(final NodeDocument doc, final 
GCPhases phases, final UpdateOp updateOp) {
+
+            // get Map of all properties along with their values
+            if (phases.start(GCPhase.DETAILED_GC_COLLECT_PROPS)) {
+                final Set<String> properties = doc.getPropertyNames();
+
+                // find all the properties which can be removed from document.
+                // All the properties whose value is null in head revision are
+                // eligible to be garbage collected.
+
+                final Set<String> retainPropSet = 
ofNullable(doc.getNodeAtRevision(nodeStore, headRevision, null))
+                        .map(DocumentNodeState::getAllBundledProperties)
+                        .map(Map::keySet)
+                        .map(p -> 
p.stream().map(Utils::escapePropertyName).collect(toSet()))
+                        .orElse(emptySet());
+
+                final int deletedPropsGCCount = properties.stream()
+                        .filter(p -> !retainPropSet.contains(p))
+                        .mapToInt(x -> {
+                            updateOp.remove(x);
+                            return 1;})
+                        .sum();
+
+                deletedPropsCountMap.put(doc.getId(), deletedPropsGCCount);
+
+                if (log.isDebugEnabled()) {
+                    log.debug("Collected {} deleted properties for document 
{}", deletedPropsGCCount, doc.getId());
+                }
+                phases.stop(GCPhase.DETAILED_GC_COLLECT_PROPS);
+            }
+        }
+
+        private void collectOldRevisions(NodeDocument doc, GCPhases phases, 
UpdateOp updateOp) {
+
+            if (phases.start(GCPhase.DETAILED_GC_COLLECT_OLD_REVS)){
+                // TODO add old rev collection logic
+                phases.stop(GCPhase.DETAILED_GC_COLLECT_OLD_REVS);
+            }
+
+        }
+
+        int getGarbageCount() {
+            return totalGarbageDocsCount;
+        }
+
+        @Override
+        public void close() {
+            totalGarbageDocsCount = 0;
+        }
+
+        public void removeGarbage(final VersionGCStats stats) {
+
+            if (updateOpList.isEmpty()) {
+                if (log.isDebugEnabled()) {
+                    log.debug("Skipping removal of detailed garbage, cause no 
garbage detected");
+                }
+                return;
+            }
+
+            int updatedDocs;
+
+            monitor.info("Proceeding to update [{}] documents", 
updateOpList.size());
+
+            if (log.isDebugEnabled()) {
+                String collect = 
updateOpList.stream().map(UpdateOp::getId).collect(joining(","));
+                log.debug("Performing batch update of documents with following 
id's [{}]", collect);
+            }
+
+            if (cancel.get()) {
+                log.info("Aborting the removal of detailed garbage since RGC 
had been cancelled");
+                return;
+            }
+
+            timer.reset().start();
+            try {
+                List<NodeDocument> oldDocs = ds.findAndUpdate(NODES, 
updateOpList);
+                int deletedProps = 
oldDocs.stream().filter(Objects::nonNull).mapToInt(d -> 
deletedPropsCountMap.getOrDefault(d.getId(), 0)).sum();
+                updatedDocs = (int) 
oldDocs.stream().filter(Objects::nonNull).count();
+                stats.updatedDetailedGCDocsCount += updatedDocs;
+                stats.deletedPropsGCCount += deletedProps;
+                log.info("Updated [{}] documents, deleted [{}] properties", 
updatedDocs, deletedProps);

Review Comment:
   I'm undecided about this `log.info`. In classic GC we use `maybeLogStats` to 
reduce log frequency. This additional `log.info` might be noisy; what do you think?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to