[ https://issues.apache.org/jira/browse/MAPREDUCE-6944?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16728705#comment-16728705 ]
Xianghao Lu edited comment on MAPREDUCE-6944 at 12/27/18 12:10 PM: ------------------------------------------------------------------- I cannot attach the patch file in the Attachments area, so I am pasting the patch text here. {quote}diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java index def9872..57b39fd 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java @@ -1564,6 +1564,15 @@ public void handle(TaskAttemptEvent event) { Task task = job.getTask(event.getTaskAttemptID().getTaskId()); TaskAttempt attempt = task.getAttempt(event.getTaskAttemptID()); ((EventHandler<TaskAttemptEvent>) attempt).handle(event); + + // fix bug of app hang because of attemptID not removed from earlierFailedMaps in some cases, such as + // 1 allocating a container with PRIORITY_MAP to a rescheduled failed map(should be PRIORITY_FAST_FAIL_MAP) + // 2 a rescheduled failed map is killed or failed without assigned container + if ((((TaskAttemptImpl) attempt).isContainerAssigned() || attempt.isFinished()) + && ((RMContainerAllocator)((ContainerAllocatorRouter)containerAllocator).containerAllocator).scheduledRequests.earlierFailedMaps.size() > 0 + && ((RMContainerAllocator)((ContainerAllocatorRouter)containerAllocator).containerAllocator).scheduledRequests.earlierFailedMaps.remove(event.getTaskAttemptID())){ + LOG.info("Remove " + event.getTaskAttemptID() + " from earlierFailedMaps"); + } } } diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java index d912b60..d890948 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java @@ -1556,7 +1556,7 @@ protected static TaskAttemptState getExternalState( } // check whether the attempt is assigned if container is not null - boolean isContainerAssigned() { + public boolean isContainerAssigned() { return container != null; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java index e459cb5..55d9cc2 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java @@ -153,7 +153,7 @@ added to the pending and are ramped up (added to scheduled) based private final AssignedRequests assignedRequests; //holds scheduled requests to be fulfilled by RM - private final ScheduledRequests scheduledRequests = new ScheduledRequests(); + public final ScheduledRequests scheduledRequests = new ScheduledRequests(); private int containersAllocated = 0; private int 
containersReleased = 0; @@ -1042,11 +1042,10 @@ public Resource getResourceLimit() { Resources.add(assignedMapResource, assignedReduceResource)); } - @Private @VisibleForTesting - class ScheduledRequests { + public class ScheduledRequests { - private final LinkedList<TaskAttemptId> earlierFailedMaps = + public final LinkedList<TaskAttemptId> earlierFailedMaps = new LinkedList<TaskAttemptId>(); /** Maps from a host to a list of Map tasks with data on the host */ @@ -1376,7 +1375,7 @@ private ContainerRequest assignToFailedMap(Container allocated) { new JobCounterUpdateEvent(assigned.attemptID.getTaskId().getJobId()); jce.addCounterUpdate(JobCounter.OTHER_LOCAL_MAPS, 1); eventHandler.handle(jce); - LOG.info("Assigned from earlierFailedMaps"); + LOG.info("Assigned from earlierFailedMaps: " + tId); break; } } {quote} was (Author: luxianghao): I cannot attach the patch file in the Attachments area, so I am pasting the patch text here. {quote} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java index def9872..a00354f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java @@ -1564,6 +1564,14 @@ public void handle(TaskAttemptEvent event) { Task task = job.getTask(event.getTaskAttemptID().getTaskId()); TaskAttempt attempt = task.getAttempt(event.getTaskAttemptID()); ((EventHandler<TaskAttemptEvent>) attempt).handle(event); + + // fix bug of app hang because of attemptID not removed from earlierFailedMaps in some cases, such as + // 1 allocating a container with PRIORITY_MAP to a rescheduled failed 
map(should be PRIORITY_FAST_FAIL_MAP) + // 2 a rescheduled failed map is killed or failed without assigned container + if (attempt.isFinished() && ((RMContainerAllocator)((ContainerAllocatorRouter)containerAllocator).containerAllocator).scheduledRequests.earlierFailedMaps.size() > 0 + && ((RMContainerAllocator)((ContainerAllocatorRouter)containerAllocator).containerAllocator).scheduledRequests.earlierFailedMaps.remove(event.getTaskAttemptID())){ + LOG.info("Remove " + event.getTaskAttemptID() + " from earlierFailedMaps"); + } } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java index e459cb5..c99a098 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java @@ -153,7 +153,7 @@ added to the pending and are ramped up (added to scheduled) based private final AssignedRequests assignedRequests; //holds scheduled requests to be fulfilled by RM - private final ScheduledRequests scheduledRequests = new ScheduledRequests(); + public final ScheduledRequests scheduledRequests = new ScheduledRequests(); private int containersAllocated = 0; private int containersReleased = 0; @@ -1042,11 +1042,10 @@ public Resource getResourceLimit() { Resources.add(assignedMapResource, assignedReduceResource)); } - @Private @VisibleForTesting - class ScheduledRequests { + public class ScheduledRequests { - private final LinkedList<TaskAttemptId> earlierFailedMaps = + public final LinkedList<TaskAttemptId> earlierFailedMaps = new LinkedList<TaskAttemptId>(); 
/** Maps from a host to a list of Map tasks with data on the host */ {quote} > MR job got hanged forever when some NMs unstable for some time > -------------------------------------------------------------- > > Key: MAPREDUCE-6944 > URL: https://issues.apache.org/jira/browse/MAPREDUCE-6944 > Project: Hadoop Map/Reduce > Issue Type: Bug > Components: applicationmaster, resourcemanager > Reporter: YunFan Zhou > Priority: Critical > Attachments: screenshot-1.png > > > We encountered several jobs in the production environment that hung: because > some of the NMs were unstable, one *MAP* of the job got stuck, > and the job could not finish properly. > However, the problem we encountered is different from the one described in > [https://issues.apache.org/jira/browse/MAPREDUCE-6513], because in our > scenario none of the *MR REDUCEs* had started executing. > When I manually killed the hung *MAP*, the job finished normally. > {noformat} > 2017-08-17 12:25:06,548 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Reduce slow start > threshold not met. completedMapsForReduceSlowstart 15564 > 2017-08-17 12:25:07,555 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Received > completed container container_e84_1502793246072_73922_01_015700 > 2017-08-17 12:25:07,556 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Recalculating > schedule, headroom=<memory:2218677, vCores:2225> > 2017-08-17 12:25:07,556 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Reduce slow start > threshold not met. 
completedMapsForReduceSlowstart 15564 > 2017-08-17 12:25:07,556 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: After Scheduling: > PendingReds:1009 ScheduledMaps:1 ScheduledReds:0 AssignedMaps:0 > AssignedReds:0 CompletedMaps:15563 CompletedReds:0 ContAlloc:15723 ContRel:26 > HostLocal:4575 RackLocal:8121 > {noformat} > {noformat} > 2017-08-17 14:49:41,793 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Before > Scheduling: PendingReds:1009 ScheduledMaps:1 ScheduledReds:0 AssignedMaps:1 > AssignedReds:0 CompletedMaps:15563 CompletedReds:0 ContAlloc:15724 ContRel:26 > HostLocal:4575 RackLocal:8121 > 2017-08-17 14:49:41,794 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor: Applying ask > limit of 1 for priority:5 and capability:<memory:1024, vCores:1> > 2017-08-17 14:49:41,799 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor: getResources() > for application_1502793246072_73922: ask=1 release= 0 newContainers=0 > finishedContainers=0 resourcelimit=<memory:1711989, vCores:1688> knownNMs=4236 > 2017-08-17 14:49:41,799 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Recalculating > schedule, headroom=<memory:1711989, vCores:1688> > 2017-08-17 14:49:41,799 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Reduce slow start > threshold not met. 
completedMapsForReduceSlowstart 15564 > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Got allocated > containers 1 > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Assigning > container Container: [ContainerId: > container_e84_1502793246072_73922_01_015726, NodeId: > bigdata-hdp-apache1960.xg01.diditaxi.com:8041, NodeHttpAddress: > bigdata-hdp-apache1960.xg01.diditaxi.com:8042, Resource: <memory:1024, > vCores:1>, Priority: 5, Token: Token { kind: ContainerToken, service: > 10.93.111.36:8041 }, ] to fast fail map > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Assigned from > earlierFailedMaps > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Assigned > container container_e84_1502793246072_73922_01_015726 to > attempt_1502793246072_73922_m_012103_5 > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Recalculating > schedule, headroom=<memory:1727349, vCores:1703> > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Reduce slow start > threshold not met. completedMapsForReduceSlowstart 15564 > 2017-08-17 14:49:42,805 INFO [RMCommunicator Allocator] > org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: After Scheduling: > PendingReds:1009 ScheduledMaps:0 ScheduledReds:0 AssignedMaps:2 > AssignedReds:0 CompletedMaps:15563 CompletedReds:0 ContAlloc:15725 ContRel:26 > HostLocal:4575 RackLocal:8121 > {noformat} > {noformat} > !screenshot-1.png! 
> {noformat} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: mapreduce-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: mapreduce-issues-h...@hadoop.apache.org