This is an automated email from the ASF dual-hosted git repository.
snemeth pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 9638985 YARN-7913. Improve error handling when application recovery
fails with exception. Contributed by Wilfred Spiegelenburg
9638985 is described below
commit 96389854280cde4e1cf58a79bc227d0b11f266bf
Author: Szilard Nemeth <[email protected]>
AuthorDate: Wed Jan 22 16:30:59 2020 +0100
YARN-7913. Improve error handling when application recovery fails with
exception. Contributed by Wilfred Spiegelenburg
---
.../scheduler/fair/FairScheduler.java | 43 ++++---
.../scheduler/fair/FairSchedulerTestBase.java | 25 ++++
.../scheduler/fair/TestFairScheduler.java | 126 +++++++++++++++++++++
3 files changed, 179 insertions(+), 15 deletions(-)
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
index 013cf9d..c8d1bf32 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
@@ -470,27 +470,40 @@ public class FairScheduler extends
return;
}
+ writeLock.lock();
try {
- writeLock.lock();
RMApp rmApp = rmContext.getRMApps().get(applicationId);
+ // This will re-create the queue on restore, however this could fail if
+ // the config was changed.
FSLeafQueue queue = assignToQueue(rmApp, queueName, user);
if (queue == null) {
- return;
+ if (!isAppRecovering) {
+ return;
+ }
+ // app is recovering we do not want to fail the app now as it was there
+ // before we started the recovery. Add it to the recovery queue:
+ // dynamic queue directly under root, no ACL needed (auto clean up)
+ queueName = "root.recovery";
+ queue = queueMgr.getLeafQueue(queueName, true);
}
- // Enforce ACLs
- UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(
- user);
-
- if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) && !queue
- .hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
- String msg = "User " + userUgi.getUserName()
- + " cannot submit applications to queue " + queue.getName()
- + "(requested queuename is " + queueName + ")";
- LOG.info(msg);
- rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, msg));
- return;
+ // Skip ACL check for recovering applications: they have been accepted
+ // in the queue already recovery should not break that.
+ if (!isAppRecovering) {
+ // Enforce ACLs
+ UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(
+ user);
+
+ if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) && !queue
+ .hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
+ String msg = "User " + userUgi.getUserName()
+ + " cannot submit applications to queue " + queue.getName()
+ + "(requested queuename is " + queueName + ")";
+ LOG.info(msg);
+ rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, msg));
+ return;
+ }
}
SchedulerApplication<FSAppAttempt> application =
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
index b998564..8ed670f 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java
@@ -250,6 +250,31 @@ public class FairSchedulerTestBase {
scheduler.update();
}
+ protected ApplicationAttemptId createRecoveringApplication(
+ Resource amResource, String queueId, String userId) {
+ ApplicationAttemptId id =
+ createAppAttemptId(this.APP_ID++, this.ATTEMPT_ID++);
+
+ // On restore the app is already created but we need to check the AM
+ // resource, make sure it is set for test
+ ResourceRequest amRequest = createResourceRequest(
+ // cast to int as we're not testing large values so it is safe
+ (int)amResource.getMemorySize(), amResource.getVirtualCores(),
+ ResourceRequest.ANY, 1, 1, true);
+ List<ResourceRequest> amReqs = new ArrayList<>();
+ amReqs.add(amRequest);
+ RMContext rmContext = resourceManager.getRMContext();
+ ApplicationId appId = id.getApplicationId();
+ RMApp rmApp = new RMAppImpl(appId, rmContext, conf, null, userId, null,
+ ApplicationSubmissionContext.newInstance(appId, null, queueId, null,
+ mock(ContainerLaunchContext.class), false, false, 0, amResource,
+ null), scheduler, null, 0, null, null, amReqs);
+ rmContext.getRMApps().put(appId, rmApp);
+
+ scheduler.addApplication(appId, queueId, userId, true);
+ return id;
+ }
+
protected void createApplicationWithAMResource(ApplicationAttemptId attId,
String queue, String user, Resource amResource) {
RMContext rmContext = resourceManager.getRMContext();
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
index 60b7344..23b04f5 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
@@ -5451,4 +5451,130 @@ public class TestFairScheduler extends
FairSchedulerTestBase {
SchedulerUtils.COMPLETED_APPLICATION),
RMContainerEventType.EXPIRE);
}
+
+ @Test
+ public void testRestoreToExistingQueue() throws IOException {
+ conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
+ generateAllocationFilePercentageResource();
+
+ scheduler.init(conf);
+ scheduler.start();
+ scheduler.reinitialize(conf, resourceManager.getRMContext());
+
+ // no nodes so the resource total should be zero for all queues
+ // AM is using resources
+ Resource amResource = Resources.createResource(1024, 1);
+ // Add the app and the attempt
+ ApplicationAttemptId appAttemptId = null;
+ String queueId = "root.parent.queueA";
+ try {
+ appAttemptId = createRecoveringApplication(amResource, queueId, "user1");
+ } catch (Exception e) {
+ fail("The exception is not expected. Exception message: "
+ + e.getMessage());
+ }
+ scheduler.addApplicationAttempt(appAttemptId, false, true);
+
+ List<ApplicationAttemptId> appsInQueue =
+ scheduler.getAppsInQueue(queueId);
+ assertEquals("Number of apps in queue 'root.parent.queueA' should be one!",
+ 1, appsInQueue.size());
+
+ appAttemptId = scheduler.getAppsInQueue(queueId).get(0);
+ assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ + " should not be null!", scheduler.getSchedulerApp(appAttemptId));
+
+ FSAppAttempt schedulerApp = scheduler.getSchedulerApp(appAttemptId);
+ assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ + " should not be null!", schedulerApp.getAppSchedulingInfo());
+
+ assertTrue("There should be no requests accepted", schedulerApp
+ .getAppSchedulingInfo().getAllResourceRequests().isEmpty());
+
+ // Restore an applications with a user that has no access to the queue
+ try {
+ appAttemptId = createRecoveringApplication(amResource, queueId,
+ "usernotinacl");
+ } catch (Exception e) {
+ fail("The exception is not expected. Exception message: "
+ + e.getMessage());
+ }
+ scheduler.addApplicationAttempt(appAttemptId, false, true);
+
+ appsInQueue = scheduler.getAppsInQueue(queueId);
+ assertEquals("Number of apps in queue 'root.parent.queueA' should be two!",
+ 2, appsInQueue.size());
+
+ appAttemptId = scheduler.getAppsInQueue(queueId).get(1);
+ assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ + " should not be null!", scheduler.getSchedulerApp(appAttemptId));
+
+ schedulerApp = scheduler.getSchedulerApp(appAttemptId);
+ assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ + " should not be null!", schedulerApp.getAppSchedulingInfo());
+
+ assertTrue("There should be no requests accepted", schedulerApp
+ .getAppSchedulingInfo().getAllResourceRequests().isEmpty());
+ }
+
+ @Test
+ public void testRestoreToParentQueue() throws IOException {
+ conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
+ generateAllocationFilePercentageResource();
+
+ scheduler.init(conf);
+ scheduler.start();
+ scheduler.reinitialize(conf, resourceManager.getRMContext());
+
+ // no nodes so the resource total should be zero for all queues
+ // AM is using resources
+ Resource amResource = Resources.createResource(1024, 1);
+ // Add the app and the attempt, use a queue that is now a parent
+ ApplicationAttemptId appAttemptId = null;
+ String queueId = "root.parent";
+ try {
+ appAttemptId = createRecoveringApplication(amResource, queueId, "user1");
+ } catch (Exception e) {
+ fail("The exception is not expected. Exception message: "
+ + e.getMessage());
+ }
+ scheduler.addApplicationAttempt(appAttemptId, false, true);
+
+ String recoveredQueue = "root.recovery";
+ List<ApplicationAttemptId> appsInQueue =
+ scheduler.getAppsInQueue(recoveredQueue);
+ assertEquals("Number of apps in queue 'root.recovery' should be one!",
+ 1, appsInQueue.size());
+
+ appAttemptId =
+ scheduler.getAppsInQueue(recoveredQueue).get(0);
+ assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ + " should not be null!", scheduler.getSchedulerApp(appAttemptId));
+
+ FSAppAttempt schedulerApp = scheduler.getSchedulerApp(appAttemptId);
+ assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ + " should not be null!", schedulerApp.getAppSchedulingInfo());
+
+ assertTrue("There should be no requests accepted", schedulerApp
+ .getAppSchedulingInfo().getAllResourceRequests().isEmpty());
+ }
+
+ private void generateAllocationFilePercentageResource()
+ throws IOException {
+ PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE));
+ out.println("<?xml version=\"1.0\"?>");
+ out.println("<allocations>");
+ out.println("<queue name=\"root\">");
+ out.println("<aclSubmitApps> </aclSubmitApps>");
+ out.println("<aclAdministerApps> </aclAdministerApps>");
+ out.println("<queue name=\"parent\">");
+ out.println("<maxChildResources>15.0%</maxChildResources>");
+ out.println("<queue name=\"queueA\">");
+ out.println("<aclSubmitApps>user1</aclSubmitApps>");
+ out.println("</queue>");
+ out.println("</queue>");
+ out.println("</queue>");
+ out.println("</allocations>");
+ out.close();
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]