stefan-egli commented on code in PR #993:
URL: https://github.com/apache/jackrabbit-oak/pull/993#discussion_r1285753432
##########
oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java:
##########
@@ -605,8 +814,161 @@ private void collectDeletedDocuments(GCPhases phases,
gc.updateResurrectedDocuments(phases.stats);
phases.stop(GCPhase.UPDATING);
}
+ }
+ }
+ }
+
+ private class DetailedGC implements Closeable {
+
+ // revision passed to doc.getNodeAtRevision(..) when deciding which properties are retained
+ private final RevisionVector headRevision;
+ // receives the info-level progress messages emitted while collecting and removing garbage
+ private final GCMonitor monitor;
+ // checked right before the batch update; when set, removal is aborted
+ private final AtomicBoolean cancel;
+ // created unstarted; reset and started around the batch update in removeGarbage
+ private final Stopwatch timer;
+ // update operations queued by collectGarbage, one per document that has changes
+ private final List<UpdateOp> updateOpList;
+
+ // document id -> number of properties scheduled for removal from that document
+ private final Map<String, Integer> deletedPropsCountMap;
+ // incremented for every document with changes; read by hasGarbage()
+ private int garbageDocsCount;
+ // incremented together with garbageDocsCount; returned by getGarbageCount(), zeroed by close()
+ private int totalGarbageDocsCount;
+
+ /**
+  * Creates a collector with empty bookkeeping (no queued updates, no per-document
+  * counts) and an unstarted timer.
+  *
+  * @param headRevision revision used to resolve the node state whose properties are retained
+  * @param monitor receives progress messages
+  * @param cancel flag consulted before the collected garbage is removed
+  * @throws NullPointerException if any argument is {@code null}
+  */
+ public DetailedGC(@NotNull RevisionVector headRevision, @NotNull GCMonitor monitor, @NotNull AtomicBoolean cancel) {
+     // every parameter is declared @NotNull, so validate all of them up front
+     // instead of failing later at the first use of monitor or cancel
+     this.headRevision = requireNonNull(headRevision);
+     this.monitor = requireNonNull(monitor);
+     this.cancel = requireNonNull(cancel);
+     this.updateOpList = new ArrayList<>();
+     this.deletedPropsCountMap = new HashMap<>();
+     this.timer = createUnstarted();
+ }
+
+ /**
+  * Runs the three collection steps (deleted properties, unmerged branch commits,
+  * old revisions) for {@code doc} and queues the resulting update operation if
+  * any step recorded a change.
+  *
+  * @param doc the document to inspect
+  * @param phases phase tracker handed on to each collection step
+  */
+ public void collectGarbage(final NodeDocument doc, final GCPhases phases) {
+
+     monitor.info("Collecting Detailed Garbage for doc [{}]", doc.getId());
+
+     final UpdateOp pendingUpdate = new UpdateOp(requireNonNull(doc.getId()), false);
+     // NOTE(review): presumably ties the update to the document's current
+     // modified value - confirm against UpdateOp.equals
+     pendingUpdate.equals(MODIFIED_IN_SECS, doc.getModified());
+
+     collectDeletedProperties(doc, phases, pendingUpdate);
+     collectUnmergedBranchCommitDocument(doc, phases, pendingUpdate);
+     collectOldRevisions(doc, phases, pendingUpdate);
+
+     if (!pendingUpdate.hasChanges()) {
+         // nothing was collected for this document, so there is nothing to queue
+         return;
+     }
+
+     garbageDocsCount++;
+     totalGarbageDocsCount++;
+     monitor.info("Collected [{}] garbage for doc [{}]", pendingUpdate.getChanges().size(), doc.getId());
+     updateOpList.add(pendingUpdate);
+ }
+
+ /**
+  * @return {@code true} once at least one document with changes has been counted
+  */
+ private boolean hasGarbage() {
+     return this.garbageDocsCount >= 1;
+ }
+
+ /**
+  * Placeholder for collecting unmerged branch commits: at present it only starts
+  * and stops the corresponding phase and records nothing on {@code updateOp}.
+  *
+  * @param doc the document to inspect (currently unused)
+  * @param phases phase tracker
+  * @param updateOp update operation to extend (currently unused)
+  */
+ private void collectUnmergedBranchCommitDocument(final NodeDocument doc, final GCPhases phases, final UpdateOp updateOp) {
+     if (!phases.start(GCPhase.DETAILED_GC_COLLECT_UNMERGED_BC)) {
+         return;
+     }
+     // TODO add unmerged BC collection logic
+     phases.stop(GCPhase.DETAILED_GC_COLLECT_UNMERGED_BC);
+ }
+
+ /**
+  * Schedules for removal every property of {@code doc} that is not among the
+  * (escaped) bundled property names of the node state resolved at
+  * {@code headRevision}, and records how many were scheduled under the document id.
+  * <p>
+  * If no node state is returned for the head revision, the retained set is empty
+  * and every name reported by {@code doc.getPropertyNames()} is scheduled for removal.
+  * Nothing happens when the phase cannot be started.
+  *
+  * @param doc the document to inspect
+  * @param phases phase tracker
+  * @param updateOp update operation that receives the property removals
+  */
+ private void collectDeletedProperties(final NodeDocument doc, final GCPhases phases, final UpdateOp updateOp) {
+
+     if (!phases.start(GCPhase.DETAILED_GC_COLLECT_PROPS)) {
+         return;
+     }
+
+     // names of all properties currently present on the document
+     final Set<String> properties = doc.getPropertyNames();
+
+     // find all the properties which can be removed from document.
+     // All the properties whose value is null in head revision are
+     // eligible to be garbage collected.
+     final Set<String> retainPropSet = ofNullable(doc.getNodeAtRevision(nodeStore, headRevision, null))
+             .map(DocumentNodeState::getAllBundledProperties)
+             .map(Map::keySet)
+             .map(p -> p.stream().map(Utils::escapePropertyName).collect(toSet()))
+             .orElse(emptySet());
+
+     // plain loop rather than a stream: mutating updateOp is a side effect and
+     // does not belong inside an intermediate stream operation
+     int deletedPropsGCCount = 0;
+     for (final String property : properties) {
+         if (!retainPropSet.contains(property)) {
+             updateOp.remove(property);
+             deletedPropsGCCount++;
+         }
+     }
+
+     deletedPropsCountMap.put(doc.getId(), deletedPropsGCCount);
+
+     if (log.isDebugEnabled()) {
+         log.debug("Collected {} deleted properties for document {}", deletedPropsGCCount, doc.getId());
+     }
+     phases.stop(GCPhase.DETAILED_GC_COLLECT_PROPS);
+ }
+
+ /**
+  * Placeholder for collecting old revisions: at present it only starts and stops
+  * the corresponding phase and records nothing on {@code updateOp}.
+  *
+  * @param doc the document to inspect (currently unused)
+  * @param phases phase tracker
+  * @param updateOp update operation to extend (currently unused)
+  */
+ private void collectOldRevisions(NodeDocument doc, GCPhases phases, UpdateOp updateOp) {
+     if (!phases.start(GCPhase.DETAILED_GC_COLLECT_OLD_REVS)) {
+         return;
+     }
+     // TODO add old rev collection logic
+     phases.stop(GCPhase.DETAILED_GC_COLLECT_OLD_REVS);
+ }
+
+ /**
+  * @return the running total of documents for which changes were collected
+  */
+ int getGarbageCount() {
+     return this.totalGarbageDocsCount;
+ }
+
+ /**
+  * Resets the running total of garbage documents to zero.
+  */
+ @Override
+ public void close() {
+     this.totalGarbageDocsCount = 0;
+ }
+
+ public void removeGarbage(final VersionGCStats stats) {
+
+ if (updateOpList.isEmpty()) {
+ if (log.isDebugEnabled()) {
+ log.debug("Skipping removal of detailed garbage, cause no
garbage detected");
+ }
+ return;
+ }
+
+ int updatedDocs;
+
+ monitor.info("Proceeding to update [{}] documents",
updateOpList.size());
+
+ if (log.isDebugEnabled()) {
+ String collect =
updateOpList.stream().map(UpdateOp::getId).collect(joining(","));
+ log.debug("Performing batch update of documents with following
id's [{}]", collect);
+ }
+
+ if (cancel.get()) {
+ log.info("Aborting the removal of detailed garbage since RGC
had been cancelled");
+ return;
+ }
+
+ timer.reset().start();
+ try {
+ List<NodeDocument> oldDocs = ds.findAndUpdate(NODES,
updateOpList);
+ int deletedProps =
oldDocs.stream().filter(Objects::nonNull).mapToInt(d ->
deletedPropsCountMap.getOrDefault(d.getId(), 0)).sum();
+ updatedDocs = (int)
oldDocs.stream().filter(Objects::nonNull).count();
+ stats.updatedDetailedGCDocsCount += updatedDocs;
+ stats.deletedPropsGCCount += deletedProps;
+ log.info("Updated [{}] documents, deleted [{}] properties",
updatedDocs, deletedProps);
Review Comment:
I'm undecided about this `log.info`. In classic GC we use `maybeLogStats` to
reduce log frequency. This additional `log.info` might be noisy; what do you think?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]