This is an automated email from the ASF dual-hosted git repository.
noob-se7en pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 6464736bc35 Skip IdealState resource scan in instanceDropSafetyCheck
for minions (#18776)
6464736bc35 is described below
commit 6464736bc35080edf2f171f190005693db0969d5
Author: Shounak kulkarni <[email protected]>
AuthorDate: Tue Jun 16 14:05:09 2026 +0530
Skip IdealState resource scan in instanceDropSafetyCheck for minions
(#18776)
The instance drop safety check scans every resource's IdealState to verify
the instance hosts no resource. Minions never appear in any IdealState
(their
task assignments live in the Helix Task Framework
JobContext/WorkflowContext,
not in IdealState), so the scan can only ever return empty for them. Running
it per minion pulls every IdealState into the controller heap repeatedly,
causing heap pressure when many minions are dropped in succession.
Gate the scan on !InstanceTypeUtils.isMinion(instanceName), keeping the
cheap
liveness (IS_ALIVE) check for all instance types. The scan is still required
for controllers/servers/brokers (e.g. controllers participate in the lead
controller resource), so it is skipped only for minions.
---
.../helix/core/PinotHelixResourceManager.java | 27 ++++++---
.../PinotHelixResourceManagerMinionDrainTest.java | 65 ++++++++++++++++++++++
2 files changed, 83 insertions(+), 9 deletions(-)
diff --git
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
index 4013379f722..fa7ecb8d2a1 100644
---
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
+++
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
@@ -4222,6 +4222,11 @@ public class PinotHelixResourceManager {
/**
* Utility to perform a safety check of the operation to drop an instance.
* If the resource is not safe to drop the utility lists all the possible
reasons.
+ * <p>The cluster-wide IdealState scan is skipped for minion instances:
minions never appear in any
+ * resource IdealState (their task assignments live in the Helix Task
Framework
+ * {@code JobContext}/{@code WorkflowContext}, not in IdealState), so the
scan can only ever return
+ * empty for them. Skipping it avoids pulling every IdealState into the
controller heap, which is a
+ * significant source of heap pressure when many minions are dropped in
succession.
* @param instanceName Pinot instance name
* @return {@link OperationValidationResponse}
*/
@@ -4231,16 +4236,20 @@ public class PinotHelixResourceManager {
if (_helixDataAccessor.getProperty(_keyBuilder.liveInstance(instanceName))
!= null) {
response.putIssue(OperationValidationResponse.ErrorCode.IS_ALIVE,
instanceName);
}
- // Check if any ideal state includes the instance
- getAllResources().forEach(resource -> {
- IdealState idealState =
_helixAdmin.getResourceIdealState(_helixClusterName, resource);
- for (String partition : idealState.getPartitionSet()) {
- if (idealState.getInstanceSet(partition).contains(instanceName)) {
-
response.putIssue(OperationValidationResponse.ErrorCode.CONTAINS_RESOURCE,
instanceName, resource);
- break;
+ // Check if any ideal state includes the instance. Minions never host any
resource, so skip the
+ // expensive scan for them. Controllers/servers/brokers can host resources
(e.g. controllers are
+ // participants in the lead controller resource), so the scan is still
required for them.
+ if (!InstanceTypeUtils.isMinion(instanceName)) {
+ getAllResources().forEach(resource -> {
+ IdealState idealState =
_helixAdmin.getResourceIdealState(_helixClusterName, resource);
+ for (String partition : idealState.getPartitionSet()) {
+ if (idealState.getInstanceSet(partition).contains(instanceName)) {
+
response.putIssue(OperationValidationResponse.ErrorCode.CONTAINS_RESOURCE,
instanceName, resource);
+ break;
+ }
}
- }
- });
+ });
+ }
return response.setSafe(response.getIssues().isEmpty());
}
diff --git
a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerMinionDrainTest.java
b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerMinionDrainTest.java
index 03e76636d48..143e2909f28 100644
---
a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerMinionDrainTest.java
+++
b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerMinionDrainTest.java
@@ -24,10 +24,12 @@ import java.util.List;
import org.apache.helix.HelixManager;
import org.apache.helix.HelixManagerFactory;
import org.apache.helix.model.InstanceConfig;
+import org.apache.pinot.controller.api.resources.OperationValidationResponse;
import org.apache.pinot.controller.helix.ControllerTest;
import org.apache.pinot.spi.config.instance.Instance;
import org.apache.pinot.spi.config.instance.InstanceType;
import org.apache.pinot.spi.utils.CommonConstants.Helix;
+import org.mockito.Mockito;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@@ -480,6 +482,69 @@ public class PinotHelixResourceManagerMinionDrainTest
extends ControllerTest {
_helixResourceManager.dropInstance(minionInstanceId);
}
+ @Test
+ public void testInstanceDropSafetyCheckSkipsResourceScanForMinion() {
+ // A minion never hosts any resource, so the drop safety check must skip
the expensive cluster-wide
+ // IdealState scan entirely (getAllResources is never invoked) while still
reporting it as safe.
+ String minionHost = "minion-test-safety.example.com";
+ int minionPort = 9530;
+ Instance minionInstance = new Instance(minionHost, minionPort,
InstanceType.MINION,
+ Collections.singletonList(Helix.UNTAGGED_MINION_INSTANCE), null, 0, 0,
0, 0, false);
+ assertTrue(_helixResourceManager.addInstance(minionInstance,
false).isSuccessful());
+ String minionInstanceId = "Minion_" + minionHost + "_" + minionPort;
+
+ PinotHelixResourceManager spy = Mockito.spy(_helixResourceManager);
+ OperationValidationResponse response =
spy.instanceDropSafetyCheck(minionInstanceId);
+
+ assertTrue(response.isSafe(), "Minion should be safe to drop: " +
response.getIssues());
+ // The resource scan must be skipped for minions - this is the behavior
under test.
+ Mockito.verify(spy, Mockito.never()).getAllResources();
+
+ // Cleanup
+ _helixResourceManager.dropInstance(minionInstanceId);
+ }
+
+ @Test
+ public void testInstanceDropSafetyCheckRunsResourceScanForNonMinion() {
+ // Non-minion instances can host resources, so the scan must still run.
This locks the !isMinion
+ // guard: inverting it would skip the scan here and fail this verification.
+ String brokerHost = "broker-test-safety.example.com";
+ int brokerPort = 9531;
+ Instance brokerInstance = new Instance(brokerHost, brokerPort,
InstanceType.BROKER,
+ Collections.singletonList(Helix.UNTAGGED_BROKER_INSTANCE), null, 0, 0,
0, 0, false);
+ assertTrue(_helixResourceManager.addInstance(brokerInstance,
false).isSuccessful());
+ String brokerInstanceId = "Broker_" + brokerHost + "_" + brokerPort;
+
+ PinotHelixResourceManager spy = Mockito.spy(_helixResourceManager);
+ spy.instanceDropSafetyCheck(brokerInstanceId);
+
+ // The resource scan must run for non-minion instances.
+ Mockito.verify(spy, Mockito.atLeastOnce()).getAllResources();
+
+ // Cleanup
+ _helixResourceManager.dropInstance(brokerInstanceId);
+ }
+
+ @Test
+ public void testInstanceDropSafetyCheckFlagsLiveMinion()
+ throws Exception {
+ // The cheap liveness check must still apply to minions even though the
resource scan is skipped:
+ // a live minion is reported as unsafe with an IS_ALIVE issue.
+ String minionHost = "minion-test-live.example.com";
+ int minionPort = 9532;
+ Instance minionInstance = new Instance(minionHost, minionPort,
InstanceType.MINION,
+ Collections.singletonList(Helix.UNTAGGED_MINION_INSTANCE), null, 0, 0,
0, 0, false);
+ assertTrue(_helixResourceManager.addInstance(minionInstance,
false).isSuccessful());
+ String minionInstanceId = "Minion_" + minionHost + "_" + minionPort;
+ createFakeMinionLiveInstance(minionInstanceId);
+
+ OperationValidationResponse response =
_helixResourceManager.instanceDropSafetyCheck(minionInstanceId);
+ assertFalse(response.isSafe(), "Live minion should not be safe to drop");
+ assertTrue(response.getIssues().stream()
+ .anyMatch(issue -> issue.getCode() ==
OperationValidationResponse.ErrorCode.IS_ALIVE),
+ "Live minion drop safety check should produce an IS_ALIVE issue");
+ }
+
@AfterClass
public void tearDown() {
// Disconnect fake minion managers
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]