[ https://issues.apache.org/jira/browse/MAPREDUCE-7458?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Tao Yang updated MAPREDUCE-7458: -------------------------------- Description: There is a rare race condition in *TaskReportPBImpl#getProto* when JobHistoryServer receives concurrent getTaskReports requests for the same job at the same time. Exception scenario: # A client calls JobClient#getTaskReports in parallel for the same job at the same time. # JobHistoryServer gets these requests and then generates responses based on *cached* task reports according to HistoryClientService$HSClientProtocolHandler#getTaskReports. # When the same task report is processed concurrently, we may see UnsupportedOperationException exceptions with different stacks, as follows. ExceptionStack-1: TaskReportPBImpl#convertToProtoFormat {noformat} java.lang.UnsupportedOperationException at java.util.AbstractList.add(AbstractList.java:148) at java.util.AbstractList.add(AbstractList.java:108) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$CounterGroupProto$Builder.addAllCounters(MRProtos.java:4393) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.addContersToProto(CounterGroupPBImpl.java:182) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.mergeLocalToBuilder(CounterGroupPBImpl.java:63) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.mergeLocalToProto(CounterGroupPBImpl.java:70) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.getProto(CounterGroupPBImpl.java:55) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.convertToProtoFormat(CountersPBImpl.java:195) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.access$100(CountersPBImpl.java:38) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl$1$1.next(CountersPBImpl.java:162) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl$1$1.next(CountersPBImpl.java:150) at 
com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:329) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$CountersProto$Builder.addAllCounterGroups(MRProtos.java:5102) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.addCounterGroupsToProto(CountersPBImpl.java:172) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.mergeLocalToBuilder(CountersPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.mergeLocalToProto(CountersPBImpl.java:71) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.getProto(CountersPBImpl.java:56) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.convertToProtoFormat(TaskReportPBImpl.java:401) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:76) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:329) at org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) at 
org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) at org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) at org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} ExceptionStack-2: TaskReportPBImpl#addDiagnosticsToProto {noformat} java.lang.UnsupportedOperationException at java.util.AbstractList.add(AbstractList.java:148) at java.util.AbstractList.add(AbstractList.java:108) at java.util.AbstractCollection.addAll(AbstractCollection.java:344) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:327) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$TaskReportProto$Builder.addAllDiagnostics(MRProtos.java:7048) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.addDiagnosticsToProto(TaskReportPBImpl.java:378) 
at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:85) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) at com.google.protobuf.AbstractMessageLite$Builder.checkForNullValues(AbstractMessageLite.java:336) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:323) at org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) at org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) at 
org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} ExceptionStack-3: TaskReportPBImpl#addRunningAttemptsToProto {noformat} java.lang.UnsupportedOperationException at java.util.AbstractList.add(AbstractList.java:148) at java.util.AbstractList.add(AbstractList.java:108) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$TaskReportProto$Builder.addAllRunningAttempts(MRProtos.java:6767) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.addRunningAttemptsToProto(TaskReportPBImpl.java:299) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:79) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) at 
org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) at com.google.protobuf.AbstractMessageLite$Builder.checkForNullValues(AbstractMessageLite.java:336) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:323) at org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) at org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) at org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} I propose to fix this race condition by *adding the synchronized modifier to TaskReportPBImpl#getProto*, as is already done for other classes (e.g. JobReportPBImpl, 
AMInfoPBImpl, etc.). was: There is a rare race condition in *TaskReportPBImpl.getProto* when JobHistoryServer getting concurrent getTaskReports requests for the same job at the same time. Exception scenario: # client calls JobClient#getTaskReports in parallel for the same job at the same time. # JobHistoryServer gets these requests and then generating response based on *cached* task reports according to HistoryClientService$HSClientProtocolHandler#getTaskReports. # When the same task report is processed concurrently, we may see UnsupportedOperationException exceptions with different stacks as following. ExceptionStack-1: TaskReportPBImpl#convertToProtoFormat {noformat} java.lang.UnsupportedOperationException at java.util.AbstractList.add(AbstractList.java:148) at java.util.AbstractList.add(AbstractList.java:108) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$CounterGroupProto$Builder.addAllCounters(MRProtos.java:4393) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.addContersToProto(CounterGroupPBImpl.java:182) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.mergeLocalToBuilder(CounterGroupPBImpl.java:63) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.mergeLocalToProto(CounterGroupPBImpl.java:70) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.getProto(CounterGroupPBImpl.java:55) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.convertToProtoFormat(CountersPBImpl.java:195) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.access$100(CountersPBImpl.java:38) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl$1$1.next(CountersPBImpl.java:162) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl$1$1.next(CountersPBImpl.java:150) at 
com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:329) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$CountersProto$Builder.addAllCounterGroups(MRProtos.java:5102) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.addCounterGroupsToProto(CountersPBImpl.java:172) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.mergeLocalToBuilder(CountersPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.mergeLocalToProto(CountersPBImpl.java:71) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.getProto(CountersPBImpl.java:56) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.convertToProtoFormat(TaskReportPBImpl.java:401) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:76) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:329) at org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) at 
org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) at org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) at org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} ExceptionStack-2: TaskReportPBImpl#addDiagnosticsToProto {noformat} java.lang.UnsupportedOperationException at java.util.AbstractList.add(AbstractList.java:148) at java.util.AbstractList.add(AbstractList.java:108) at java.util.AbstractCollection.addAll(AbstractCollection.java:344) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:327) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$TaskReportProto$Builder.addAllDiagnostics(MRProtos.java:7048) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.addDiagnosticsToProto(TaskReportPBImpl.java:378) 
at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:85) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) at com.google.protobuf.AbstractMessageLite$Builder.checkForNullValues(AbstractMessageLite.java:336) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:323) at org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) at org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) at 
org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} ExceptionStack-3: TaskReportPBImpl#addRunningAttemptsToProto {noformat} java.lang.UnsupportedOperationException at java.util.AbstractList.add(AbstractList.java:148) at java.util.AbstractList.add(AbstractList.java:108) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) at org.apache.hadoop.mapreduce.v2.proto.MRProtos$TaskReportProto$Builder.addAllRunningAttempts(MRProtos.java:6767) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.addRunningAttemptsToProto(TaskReportPBImpl.java:299) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:79) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) at org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) at 
org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) at com.google.protobuf.AbstractMessageLite$Builder.checkForNullValues(AbstractMessageLite.java:336) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:323) at org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) at org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) at org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) at org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} I propose to fix this race condition by *adding synchronized flag for TaskReportPBImpl#getProto* like others (e.g.JobReportPBImpl, 
AMInfoPBImpl, etc.). > Race condition in TaskReportPBImpl#getProto when generating task reports > process in concurrency scenarios > --------------------------------------------------------------------------------------------------------- > > Key: MAPREDUCE-7458 > URL: https://issues.apache.org/jira/browse/MAPREDUCE-7458 > Project: Hadoop Map/Reduce > Issue Type: Bug > Components: jobhistoryserver > Affects Versions: 3.3.6 > Reporter: Tao Yang > Priority: Major > Labels: pull-request-available > > There is a rare race condition in *TaskReportPBImpl#getProto* when > JobHistoryServer receives concurrent getTaskReports requests for the same job > at the same time. > Exception scenario: > # A client calls JobClient#getTaskReports in parallel for the same job at the > same time. > # JobHistoryServer gets these requests and then generates responses based on > *cached* task reports according to > HistoryClientService$HSClientProtocolHandler#getTaskReports. > # When the same task report is processed concurrently, we may see > UnsupportedOperationException exceptions with different stacks, as follows. 
> ExceptionStack-1: TaskReportPBImpl#convertToProtoFormat > {noformat} > java.lang.UnsupportedOperationException > at java.util.AbstractList.add(AbstractList.java:148) > at java.util.AbstractList.add(AbstractList.java:108) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) > at > org.apache.hadoop.mapreduce.v2.proto.MRProtos$CounterGroupProto$Builder.addAllCounters(MRProtos.java:4393) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.addContersToProto(CounterGroupPBImpl.java:182) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.mergeLocalToBuilder(CounterGroupPBImpl.java:63) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.mergeLocalToProto(CounterGroupPBImpl.java:70) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CounterGroupPBImpl.getProto(CounterGroupPBImpl.java:55) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.convertToProtoFormat(CountersPBImpl.java:195) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.access$100(CountersPBImpl.java:38) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl$1$1.next(CountersPBImpl.java:162) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl$1$1.next(CountersPBImpl.java:150) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:329) > at > org.apache.hadoop.mapreduce.v2.proto.MRProtos$CountersProto$Builder.addAllCounterGroups(MRProtos.java:5102) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.addCounterGroupsToProto(CountersPBImpl.java:172) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.mergeLocalToBuilder(CountersPBImpl.java:64) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.mergeLocalToProto(CountersPBImpl.java:71) > at > 
org.apache.hadoop.mapreduce.v2.api.records.impl.pb.CountersPBImpl.getProto(CountersPBImpl.java:56) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.convertToProtoFormat(TaskReportPBImpl.java:401) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:76) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:329) > at > org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) > at > 
org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) > at > org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) > at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} > ExceptionStack-2: TaskReportPBImpl#addDiagnosticsToProto > > {noformat} > java.lang.UnsupportedOperationException > at java.util.AbstractList.add(AbstractList.java:148) > at java.util.AbstractList.add(AbstractList.java:108) > at java.util.AbstractCollection.addAll(AbstractCollection.java:344) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:327) > at > org.apache.hadoop.mapreduce.v2.proto.MRProtos$TaskReportProto$Builder.addAllDiagnostics(MRProtos.java:7048) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.addDiagnosticsToProto(TaskReportPBImpl.java:378) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:85) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) > at > 
org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) > at > com.google.protobuf.AbstractMessageLite$Builder.checkForNullValues(AbstractMessageLite.java:336) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:323) > at > org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) > at > org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) > at > org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) > at java.security.AccessController.doPrivileged(Native Method) > at 
javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) > at > org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} > > ExceptionStack-3: TaskReportPBImpl#addRunningAttemptsToProto > > {noformat} > java.lang.UnsupportedOperationException > at java.util.AbstractList.add(AbstractList.java:148) > at java.util.AbstractList.add(AbstractList.java:108) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) > at > org.apache.hadoop.mapreduce.v2.proto.MRProtos$TaskReportProto$Builder.addAllRunningAttempts(MRProtos.java:6767) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.addRunningAttemptsToProto(TaskReportPBImpl.java:299) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToBuilder(TaskReportPBImpl.java:79) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.mergeLocalToProto(TaskReportPBImpl.java:92) > at > org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskReportPBImpl.getProto(TaskReportPBImpl.java:64) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.convertToProtoFormat(GetTaskReportsResponsePBImpl.java:173) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.access$100(GetTaskReportsResponsePBImpl.java:36) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:138) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl$1$1.next(GetTaskReportsResponsePBImpl.java:127) > at > com.google.protobuf.AbstractMessageLite$Builder.checkForNullValues(AbstractMessageLite.java:336) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:323) > at > 
org.apache.hadoop.mapreduce.v2.proto.MRServiceProtos$GetTaskReportsResponseProto$Builder.addAllTaskReports(MRServiceProtos.java:7049) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.addTaskReportsToProto(GetTaskReportsResponsePBImpl.java:150) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToBuilder(GetTaskReportsResponsePBImpl.java:62) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.mergeLocalToProto(GetTaskReportsResponsePBImpl.java:69) > at > org.apache.hadoop.mapreduce.v2.api.protocolrecords.impl.pb.GetTaskReportsResponsePBImpl.getProto(GetTaskReportsResponsePBImpl.java:54) > at > org.apache.hadoop.mapreduce.v2.api.impl.pb.service.MRClientProtocolPBServiceImpl.getTaskReports(MRClientProtocolPBServiceImpl.java:186) > at > org.apache.hadoop.yarn.proto.MRClientProtocol$MRClientProtocolService$2.callBlockingMethod(MRClientProtocol.java:285) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:999) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:927) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) > at > org.apache.hadoop.ipc.Server$Handler.run(Server.java:2915){noformat} > > > I propose to fix this race condition by *adding the synchronized modifier to > TaskReportPBImpl#getProto*, as other PBImpl classes already do (e.g. JobReportPBImpl, AMInfoPBImpl, > etc.). 
-- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: mapreduce-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: mapreduce-issues-h...@hadoop.apache.org