[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13535863#comment-13535863 ] Hudson commented on YARN-230: - Integrated in Hadoop-Yarn-trunk #70 (See [https://builds.apache.org/job/Hadoop-Yarn-trunk/70/]) YARN-230. RM Restart phase 1 - includes support for saving/restarting all applications on an RM bounce. Contributed by Bikas Saha. (Revision 1423758) Result = SUCCESS acmurthy : http://svn.apache.org/viewcvs.cgi/?root=Apache-SVN&view=rev&rev=1423758 Files : * /hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ApplicationAttemptStateData.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ApplicationStateData.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationAttemptStateDataPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationStateDataPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationSubmissionContextPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestGetGroups.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestYarnClient.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java * 
/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileRMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/NullRMStateStore.java * 
/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreAppAttemptEvent.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreEvent.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreEventType.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreFactory.java *
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13535937#comment-13535937 ] Hudson commented on YARN-230: - Integrated in Hadoop-Hdfs-trunk #1259 (See [https://builds.apache.org/job/Hadoop-Hdfs-trunk/1259/]) YARN-230. RM Restart phase 1 - includes support for saving/restarting all applications on an RM bounce. Contributed by Bikas Saha. (Revision 1423758) Result = FAILURE acmurthy : http://svn.apache.org/viewcvs.cgi/?root=Apache-SVN&view=rev&rev=1423758 Files : * /hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ApplicationAttemptStateData.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ApplicationStateData.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationAttemptStateDataPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationStateDataPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationSubmissionContextPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestGetGroups.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestYarnClient.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java * 
/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileRMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/NullRMStateStore.java * 
/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreAppAttemptEvent.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreEvent.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreEventType.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreFactory.java *
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13535634#comment-13535634 ] Hudson commented on YARN-230: - Integrated in Hadoop-trunk-Commit #3140 (See [https://builds.apache.org/job/Hadoop-trunk-Commit/3140/]) YARN-230. RM Restart phase 1 - includes support for saving/restarting all applications on an RM bounce. Contributed by Bikas Saha. (Revision 1423758) Result = SUCCESS acmurthy : http://svn.apache.org/viewcvs.cgi/?root=Apache-SVN&view=rev&rev=1423758 Files : * /hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ApplicationAttemptStateData.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ApplicationStateData.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationAttemptStateDataPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationStateDataPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ApplicationSubmissionContextPBImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestGetGroups.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestYarnClient.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java * 
/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileRMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/NullRMStateStore.java * 
/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreAppAttemptEvent.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreEvent.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreEventType.java * /hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreFactory.java *
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13534044#comment-13534044 ] Tom White commented on YARN-230: Arun, yes it looks good to me, +1. We can address any changes that come up in later JIRAs. Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13534098#comment-13534098 ] Bikas Saha commented on YARN-230: - Thanks guys! Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13532371#comment-13532371 ] Tom White commented on YARN-230: OK, in which case please make the default state store the filesystem one with the default URI discussed earlier. Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13532499#comment-13532499 ] Bikas Saha commented on YARN-230: - Yes. Will do when I refresh the patch on YARN-231 after YARN-230 gets committed. Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13532597#comment-13532597 ] Bikas Saha commented on YARN-230: - A note on testing. Among other test changes, there is a functional test that takes the RM through different scenarios of applications being stored, run and re-run as well as nodes heartbeating and reconnecting on restart. I have manually tested the scenarios on a single node setup with ZK and FileSystem implementations of the RMStateStore. Arinto has run the code on a cluster using ZK for storage and verified that it works as expected. https://issues.apache.org/jira/browse/YARN-128?focusedCommentId=13505615&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13505615 Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13531147#comment-13531147 ] Tom White commented on YARN-230: Thanks for addressing my feedback, Bikas. The NullRMStateStore is a good idea. With it, there is no need for yarn.resourcemanager.recovery.enabled; instead, make the default yarn.resourcemanager.store.class the NullRMStateStore. For this to work, NullRMStateStore's loadState method should return an unpopulated RMState object rather than null. Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13529601#comment-13529601 ] Hadoop QA commented on YARN-230: {color:red}-1 overall{color}. Here are the results of testing the latest attachment http://issues.apache.org/jira/secure/attachment/12560499/YARN-230.5.patch against trunk revision . {color:green}+1 @author{color}. The patch does not contain any @author tags. {color:green}+1 tests included{color}. The patch appears to include 20 new or modified test files. {color:green}+1 javac{color}. The applied patch does not increase the total number of javac compiler warnings. {color:green}+1 javadoc{color}. The javadoc tool did not generate any warning messages. {color:green}+1 eclipse:eclipse{color}. The patch built with eclipse:eclipse. {color:green}+1 findbugs{color}. The patch does not introduce any new Findbugs (version 1.3.9) warnings. {color:green}+1 release audit{color}. The applied patch does not increase the total number of release audit warnings. {color:red}-1 core tests{color}. 
The patch failed these unit tests in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests: org.apache.hadoop.yarn.server.resourcemanager.TestApplicationCleanup org.apache.hadoop.yarn.server.resourcemanager.TestFifoScheduler org.apache.hadoop.yarn.server.resourcemanager.applicationsmanager.TestAMRMRPCNodeUpdates org.apache.hadoop.yarn.server.resourcemanager.webapp.TestRMWebServicesApps org.apache.hadoop.yarn.server.resourcemanager.TestApplicationMasterLauncher org.apache.hadoop.yarn.server.resourcemanager.TestRM org.apache.hadoop.yarn.server.resourcemanager.TestAppManager org.apache.hadoop.yarn.server.resourcemanager.security.TestApplicationTokens org.apache.hadoop.yarn.server.resourcemanager.applicationsmanager.TestAMRMRPCResponseId org.apache.hadoop.yarn.server.resourcemanager.TestAMAuthorization org.apache.hadoop.yarn.server.resourcemanager.security.TestClientTokens org.apache.hadoop.yarn.server.TestContainerManagerSecurity {color:green}+1 contrib tests{color}. The patch passed contrib unit tests. Test results: https://builds.apache.org/job/PreCommit-YARN-Build/216//testReport/ Console output: https://builds.apache.org/job/PreCommit-YARN-Build/216//console This message is automatically generated. Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch, YARN-230.5.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. 
Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13507475#comment-13507475 ] Hadoop QA commented on YARN-230: {color:green}+1 overall{color}. Here are the results of testing the latest attachment http://issues.apache.org/jira/secure/attachment/1228/YARN-230.4.patch against trunk revision . {color:green}+1 @author{color}. The patch does not contain any @author tags. {color:green}+1 tests included{color}. The patch appears to include 20 new or modified test files. {color:green}+1 javac{color}. The applied patch does not increase the total number of javac compiler warnings. {color:green}+1 javadoc{color}. The javadoc tool did not generate any warning messages. {color:green}+1 eclipse:eclipse{color}. The patch built with eclipse:eclipse. {color:green}+1 findbugs{color}. The patch does not introduce any new Findbugs (version 1.3.9) warnings. {color:green}+1 release audit{color}. The applied patch does not increase the total number of release audit warnings. {color:green}+1 core tests{color}. The patch passed unit tests in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests. {color:green}+1 contrib tests{color}. The patch passed contrib unit tests. Test results: https://builds.apache.org/job/PreCommit-YARN-Build/201//testReport/ Console output: https://builds.apache.org/job/PreCommit-YARN-Build/201//console This message is automatically generated. 
Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch, YARN-230.4.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13505427#comment-13505427 ] Tom White commented on YARN-230: bq. But it could make sense to remove application attempts but not remove the application, couldn't it? Say we want to remove some attempt from the saved state before the application is done. Let's add it when we need it then. bq. We also need to change the AM retry default to 1. Otherwise, even with RM restart enabled, the restarted attempts will fail because the previous AM will delete job files. What is your suggestion for that? I think this is where the killed/failed distinction comes in. If the app attempt was killed (because the RM died), then the app will be retried since the first attempt didn't count (from the point of view of yarn.resourcemanager.am.max-retries). This should be taken care of in YARN-218 - does that sound OK to you? Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13505567#comment-13505567 ] Hadoop QA commented on YARN-230: {color:red}-1 overall{color}. Here are the results of testing the latest attachment http://issues.apache.org/jira/secure/attachment/12554383/Test.patch against trunk revision . {color:red}-1 patch{color}. The patch command could not apply the patch. Console output: https://builds.apache.org/job/PreCommit-YARN-Build/170//console This message is automatically generated. Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=13504765#comment-13504765 ] Tom White commented on YARN-230: {quote} The store and remove methods have been made mirrors because it helps maintain symmetry of operations that is logically clear. An actual implementation could choose to remove the entire app data including attempts in removeApplication() making removeApplicationAttempt() a no-op. So that alternative is not precluded in the current interface while still maintaining flexibility at the interface. {quote} Why is this flexibility needed? I can't see why it makes sense to remove an application and leave some application attempts around. bq. I chose to not use directories for FileSystem because one could put a key value store behind a FileSystem interface and I am not sure how directories would work in them. That's reasonable. With the orphan handling (deletion) on restart, the flat structure you have should work fine. (However, I don't think you need the removeApplicationAttempt() method.) bq. One improvement would be to update the store with an attempts final state (failed/killed/succeeded) and wait for it to be recorded before completing the state machine. I agree this can be done later. bq. Could you please help by providing a good system path. How about something like ${hadoop.tmp.dir}/yarn/system/rm-store? Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. 
After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=13504775#comment-13504775 ] Bikas Saha commented on YARN-230: - bq. Why is this flexibility needed? I can't see why it makes sense to remove an application and leave some application attempts around. I agree. It does not make sense to remove application but not remove application attempts. But it could make sense to remove application attempts but not remove the application, couldn't it? Say we want to remove some attempt from the saved state before the application is done. I can update the patch with defaults for filesystem and the suggested path. On this note, for MR jobs just enabling these defaults is not enough. We also need to change the AM retry default to 1. Otherwise, even with RM restart enabled, the restarted attempts will fail because the previous AM will delete job files. What is your suggestion for that? Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=13503371#comment-13503371 ] Bikas Saha commented on YARN-230: - Ignoring orphaned attempts was explicitly added because of the current implementation of deleting the app info first to act as a marker about app completion even if attempts failed to get deleted after that. The code should have discarded orphaned attempts from the store but I forgot about that. Good catch! Will fix. The store and remove methods have been made mirrors because it helps maintain symmetry of operations that is logically clear. An actual implementation could choose to remove the entire app data including attempts in removeApplication() making removeApplicationAttempt() a no-op. So that alternative is not precluded in the current interface while still maintaining flexibility at the interface. Also, the directory implementation can still be done in which case removeApplication() could call FS.delete(Path_to_dir) and removeApplicationAttempt() would remove the attempt file under the app directory or return success if the app dir has already been deleted. I chose to not use directories for FileSystem because one could put a key value store behind a FileSystem interface and I am not sure how directories would work in them. Also rmdir is atomic on HDFS but may not be atomic on every file system. For HDFS one could certainly write a directory based file structure for store in which apps would have their own directories. But IMO the best implementation might be a transaction log type implementation similar to what HBase uses I think. It might also have better HA characteristics because HDFS guarantees single writer to a file. That however requires considerable investment of time. It is really hard to guarantee atomicity of removal when we don't know how the file system is implemented. We could use log structure implementations or for HDFS we could use atomic rmdir. 
Also, let's look at the following scenario. We cannot removeApplication() until we know that the AM has exited and the job is really done. Just after the RM knows that the job is done, the RM could die before updating state. So upon restart we can never guarantee that a completed application was recorded as completed. This is one reason why I chose not to make the state machine wait for removeApplication() to complete. One improvement would be to update the store with an attempt's final state (failed/killed/succeeded) and wait for it to be recorded before completing the state machine. This would allow us to not count killed as failed and also complete the application state machine if the last attempt had succeeded. This would implement the preferable solution in your second point above. This would still be an optimization since the RM could fail before storing the attempt state (like above) and we are back to square one. I would like to make this change after YARN-218 is done so that all related changes can be made together. I consciously chose to not provide defaults for the store because I think it's important that users understand and think about it when they enable a store. And changing the config helps trigger important questions like which store works for me, what permissions are needed etc. If you still feel strongly about it then I could add defaults like you suggest. Could you please help by providing a good system path? I am not quite familiar with typical rules used to determine them. I have explained the temporary choice of Exception on YARN-231. I will address the remaining comments in the next patch. Thanks for all the feedback. This is a good discussion. I am sure that there are improvements to be made. Unless there are big issues with the current state of the work it would be great if we can commit it and address improvements in subsequent sub-tasks. This would help keep the changes smaller and easier to manage. 
The current code refactors and places basic interface/infrastructure in place. What do you think? Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=13503228#comment-13503228 ] Tom White commented on YARN-230: Overall, this looks great. General feedback: * Can we make application removal atomic? If the RM shuts down after a completed application is removed from the state store, but before the app attempts are removed from the store, then the app attempts may be orphaned. (There's a comment about it in FileSystemRMStateStore, but no action is taken so the attempt files will remain in the store.) It might be better to make RMStateStore#removeApplicationState responsible for removing the app attempts (i.e. remove removeApplicationAttemptState). This would solve the orphaning problem, and it would also make it possible to store the app attempts in a directory nested under the application directory, which would be nicer from a scaling point of view, and also for someone having to debug the state on the filesystem. * If the RM shuts down before a (successful) completed application is removed from the state store, will it be rerun on restart, or will the fact that a successful app attempt was stored mean that it doesn't need to? Obviously, the second one would be preferable. * The exceptions thrown by the public methods of RMStateStore should be more specific than Exception. * Let's have a default for yarn.resourcemanager.store.class in yarn-default.xml. StoreFactory has MemoryRMStateStore as the default, but that's not useful when running on a cluster; FileSystemRMStateStore would be better. Similarly it would be good to have the default location for the store be a system directory on the default file system. With these two changes folks would only need to set yarn.resourcemanager.recovery.enabled to true to enable recovery. (We might make that enabled by default at some point too.) 
* MemoryRMStateStore#removeApplicationState will fail if asserts are disabled: the remove method should be called in a separate statement and assigned to a variable which can be checked in the assert. It's worth checking if this problem exists elsewhere. * Naming nit: Store was renamed to RMStateStore, so StoreFactory should be renamed to RMStateStoreFactory. * Naming nit: zk.rm-state-store rather than zk.rmstatestore for consistency with other property names. Also for fs.rmstatestore, and zk.rmstatestore.parentpath (parent-path). Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: PB-impl.patch, Recovery.patch, Store.patch, Test.patch, YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (YARN-230) Make changes for RM restart phase 1
[ https://issues.apache.org/jira/browse/YARN-230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=13501213#comment-13501213 ] Bikas Saha commented on YARN-230: - Attached patch applies on top of patch for YARN-229 Make changes for RM restart phase 1 --- Key: YARN-230 URL: https://issues.apache.org/jira/browse/YARN-230 Project: Hadoop YARN Issue Type: Sub-task Components: resourcemanager Reporter: Bikas Saha Assignee: Bikas Saha Attachments: YARN-230.1.patch As described in YARN-128, phase 1 of RM restart puts in place mechanisms to save application state and read them back after restart. Upon restart, the NM's are asked to reboot and the previously running AM's are restarted. After this is done, RM HA and work preserving restart can continue in parallel. For more details please refer to the design document in YARN-128 -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators For more information on JIRA, see: http://www.atlassian.com/software/jira