stefan-egli commented on code in PR #13: URL: https://github.com/apache/sling-org-apache-sling-discovery-oak/pull/13#discussion_r1130749457
########## src/main/java/org/apache/sling/discovery/oak/SlingIdCleanupTask.java: ########## @@ -0,0 +1,571 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sling.discovery.oak; + +import static org.osgi.util.converter.Converters.standardConverter; + +import java.util.Calendar; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.sling.api.resource.LoginException; +import org.apache.sling.api.resource.ModifiableValueMap; +import org.apache.sling.api.resource.PersistenceException; +import org.apache.sling.api.resource.Resource; +import org.apache.sling.api.resource.ResourceResolver; +import org.apache.sling.api.resource.ResourceResolverFactory; +import org.apache.sling.api.resource.ValueMap; +import org.apache.sling.commons.scheduler.ScheduleOptions; +import org.apache.sling.commons.scheduler.Scheduler; +import org.apache.sling.discovery.InstanceDescription; +import org.apache.sling.discovery.TopologyEvent; +import org.apache.sling.discovery.TopologyEvent.Type; +import org.apache.sling.discovery.TopologyEventListener; +import org.apache.sling.discovery.TopologyView; +import org.osgi.framework.BundleContext; +import org.osgi.service.component.annotations.Activate; +import org.osgi.service.component.annotations.Component; +import org.osgi.service.component.annotations.Deactivate; +import org.osgi.service.component.annotations.Modified; +import org.osgi.service.component.annotations.Reference; +import org.osgi.service.metatype.annotations.AttributeDefinition; +import org.osgi.service.metatype.annotations.Designate; +import org.osgi.service.metatype.annotations.ObjectClassDefinition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A background task that cleans up garbage slingIds after topology changes. + * <p> + * A slingId is considered garbage when: + * <ul> + * <li>it is not in the current topology</li> + * <li>it is not in the current idmap (where clusterNodeIds are reused hence + * that list stays small and does not need cleanup)</li> + * <li>its leaderElectionId was created more than 7 days ago (the + * leaderElectionId is created at activate time of the discovery.oak bundle - + * hence this more or less corresponds to the startup time of that + * instance)</li> + * </ul> + * The garbage accumulates at the following places, where it will thus be + * cleaned up from: + * <ul> + * <li>as child node under /var/discovery/oak/clusterInstances : this is the + * most performance critical garbage</li> + * <li>as a property key in /var/discovery/oak/syncTokens</li> + * </ul> + * The task by default is executed: + * <ul> + * <li>only on the leader</li> + * <li>10min after a TOPOLOGY_INIT or TOPOLOGY_CHANGED event</li> + * <li>with a maximum number of delete operations to avoid repository overload - + * that maximum is called batchSize and is 50 by default</li> + * <li>in subsequent intervals of 10min after the initial run, if that had to + * stop at the batchSize of 50 deletions</li> + * </ul> + * All parameters mentioned above can be configured. + */ +@Component +@Designate(ocd = SlingIdCleanupTask.Conf.class) +public class SlingIdCleanupTask implements TopologyEventListener, Runnable { + + final static String SLINGID_CLEANUP_ENABLED_SYSTEM_PROPERTY_NAME = "org.apache.sling.discovery.oak.slingidcleanup.enabled"; + + /** + * default age is 1 week : an instance that is not in the current topology, + * started 1 week ago is very unlikely to still be active Review Comment: Worst-case should be that the existing, active instances get into a TOPOLOGY_CHANGING situation and the restarted, racing instance fails to join the topology as well but the instance is running, so oak leases are exchanged. And killing that problematic instance would be required to get back to a proper topology. One thing that I'll try to look into is whether each instance shouldn't keep a list of all slingIds that were ever active (while it was alive). That list shouldn't be large, and holding just those slingIds shouldn't cause any memory issue. But having that list and then not cleaning up any slingId of that list would further reduce the likelihood of the described race-condition. Actually I think it would solve it completely. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
