[ https://issues.apache.org/jira/browse/HDDS-2376?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16964661#comment-16964661 ]
Sammi Chen edited comment on HDDS-2376 at 11/1/19 6:56 AM: ----------------------------------------------------------- The root cause is that I didn't restart Hadoop 2.7.5 after I deployed the latest Ozone binary, so Hadoop was still using an old version of the Ozone client (from 2 months before). This OzoneChecksumException is thrown by the NodeManager. Logs attached. It seems something changed on the Ozone server side, which makes an old-version Ozone client unable to verify the data written by itself. [~msingh] and [~hanishakoneru], thanks for paying attention to this issue. I will close it now. 2019-11-01 11:46:02,230 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on datanode 1da74a1d-f64d-4ad4-b04c-85f26687e683 org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) 
at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,243 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 
1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on datanode ed90869c-317e-4303-8922-9fa83a3983cb org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at 
org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,262 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on datanode b65b0b6c-b0bb-429f-a23d-467c72d4b85c org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at 
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at 
org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,263 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on the pipeline Pipeline[ Id: 37b13907-60ea-4101-a7d9-6f02ebaf52f6, Nodes: ed90869c-317e-4303-8922-9fa83a3983cb{ip: 10.120.113.172, host: host172, networkLocation: /rack2, certSerialId: null}1da74a1d-f64d-4ad4-b04c-85f26687e683{ip: 10.121.124.44, host: host044, networkLocation: /rack2, certSerialId: null}b65b0b6c-b0bb-429f-a23d-467c72d4b85c{ip: 10.120.139.111, host: host111, networkLocation: /rack1, certSerialId: null}, Type:STAND_ALONE, Factor:THREE, State:OPEN]. 
2019-11-01 11:46:02,266 WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: { o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo, 1572579956554, FILE, null } failed: Unexpected OzoneException: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 java.io.IOException: Unexpected OzoneException: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at 
org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) ... 
26 more Caused by: Checksum mismatch at index 0 org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at 
org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,266 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalizedResource: Resource o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo(->/data1/usercache/root/appcache/application_1567133159094_0055/filecache/10/job.splitmetainfo) transitioned from DOWNLOADING to FAILED 2019-11-01 11:46:02,266 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1567133159094_0055_01_000001 transitioned from LOCALIZING to LOCALIZATION_FAILED 2019-11-01 11:46:02,266 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalResourcesTrackerImpl: Container container_1567133159094_0055_01_000001 sent RELEASE event on a resource request { o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo, 1572579956554, FILE, null } not present in cache. 
2019-11-01 11:46:02,266 WARN org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=root OPERATION=Container Finished - Failed TARGET=ContainerImpl RESULT=FAILURE DESCRIPTION=Container failed with state: LOCALIZATION_FAILED APPID=application_1567133159094_0055 CONTAINERID=container_1567133159094_0055_01_000001 2019-11-01 11:46:02,268 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1567133159094_0055_01_000001 transitioned from LOCALIZATION_FAILED to DONE 2019-11-01 11:46:02,268 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Removing container_1567133159094_0055_01_000001 from application application_1567133159094_0055 2019-11-01 11:46:02,268 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got event CONTAINER_STOP for appId application_1567133159094_0055 2019-11-01 11:46:02,270 WARN org.apache.hadoop.ipc.Client: interrupted waiting to send rpc request to server java.lang.InterruptedException at java.util.concurrent.FutureTask.awaitDone(FutureTask.java:404) at java.util.concurrent.FutureTask.get(FutureTask.java:191) at org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(Client.java:1060) at org.apache.hadoop.ipc.Client.call(Client.java:1455) at org.apache.hadoop.ipc.Client.call(Client.java:1413) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229) at com.sun.proxy.$Proxy75.heartbeat(Unknown Source) at org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:63) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:255) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169) at 
org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117) 2019-11-01 11:46:02,270 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Unknown localizer with localizerId container_1567133159094_0055_01_000001 is sending heartbeat. Ordering it to DIE 2019-11-01 11:46:03,271 INFO org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl: Removed completed containers from NM context: [container_1567133159094_0055_01_000001] 2019-11-01 11:46:03,745 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl: Stopping resource-monitoring for container_1567133159094_0055_01_000001 2019-11-01 11:46:03,817 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Unexpected exception while waiting for channel termination java.lang.InterruptedException at java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1326) at java.util.concurrent.CountDownLatch.await(CountDownLatch.java:277) at org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelImpl.awaitTermination(ManagedChannelImpl.java:763) at org.apache.ratis.thirdparty.io.grpc.internal.ForwardingManagedChannel.awaitTermination(ForwardingManagedChannel.java:57) at org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelOrphanWrapper.awaitTermination(ManagedChannelOrphanWrapper.java:70) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.close(XceiverClientGrpc.java:201) at org.apache.hadoop.hdds.scm.XceiverClientSpi.cleanup(XceiverClientSpi.java:68) at org.apache.hadoop.hdds.scm.XceiverClientSpi.setEvicted(XceiverClientSpi.java:60) at org.apache.hadoop.hdds.scm.XceiverClientManager$1.onRemoval(XceiverClientManager.java:102) at 
com.google.common.cache.LocalCache.processPendingNotifications(LocalCache.java:2004) at com.google.common.cache.LocalCache$Segment.runUnlockedCleanup(LocalCache.java:3490) at com.google.common.cache.LocalCache$Segment.postWriteCleanup(LocalCache.java:3466) at com.google.common.cache.LocalCache$Segment.clear(LocalCache.java:3260) at com.google.common.cache.LocalCache.clear(LocalCache.java:4240) at com.google.common.cache.LocalCache$LocalManualCache.invalidateAll(LocalCache.java:4795) at org.apache.hadoop.hdds.scm.XceiverClientManager.close(XceiverClientManager.java:252) at org.apache.hadoop.io.IOUtils.cleanupWithLogger(IOUtils.java:280) at org.apache.hadoop.ozone.client.rpc.RpcClient.close(RpcClient.java:785) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.ozone.client.OzoneClientInvocationHandler.invoke(OzoneClientInvocationHandler.java:54) at com.sun.proxy.$Proxy1024.close(Unknown Source) at org.apache.hadoop.ozone.client.OzoneClient.close(OzoneClient.java:108) at org.apache.hadoop.fs.ozone.BasicOzoneClientAdapterImpl.close(BasicOzoneClientAdapterImpl.java:152) at org.apache.hadoop.fs.ozone.BasicOzoneFileSystem.close(BasicOzoneFileSystem.java:192) at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:2797) at org.apache.hadoop.fs.FileSystem.closeAllForUGI(FileSystem.java:459) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.closeFileSystems(ContainerLocalizer.java:223) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:183) at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130) at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117) 2019-11-01 11:46:04,271 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Application application_1567133159094_0055 transitioned from RUNNING to APPLICATION_RESOURCES_CLEANINGUP 2019-11-01 11:46:04,271 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got event APPLICATION_STOP for appId application_1567133159094_0055 2019-11-01 11:46:04,271 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Application application_1567133159094_0055 transitioned from APPLICATION_RESOURCES_CLEANINGUP to FINISHED was (Author: sammi): The root cause is that I didn't restart Hadoop 2.7.5 after I deployed the latest Ozone binary, so Hadoop was still using an old version of the Ozone client (from 2 months before). This OzoneChecksumException is thrown by the NodeManager. Logs attached. It seems something changed on the Ozone server side, which makes an old-version Ozone client unable to verify the data written by itself. 
2019-11-01 11:46:02,230 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on datanode 1da74a1d-f64d-4ad4-b04c-85f26687e683 org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at 
java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,243 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on datanode ed90869c-317e-4303-8922-9fa83a3983cb org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at 
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,262 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on datanode b65b0b6c-b0bb-429f-a23d-467c72d4b85c org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at 
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,263 ERROR 
org.apache.hadoop.hdds.scm.XceiverClientGrpc: Failed to execute command cmdType: ReadChunk traceID: "" containerID: 1145 datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb" readChunk { blockID { containerID: 1145 localID: 103060600027086850 blockCommitSequenceId: 948 } chunkData { chunkName: "103060600027086850_chunk_1" offset: 0 len: 245 checksumData { type: CRC32 bytesPerChecksum: 1048576 checksums: "\247\304Yf" } } } on the pipeline Pipeline[ Id: 37b13907-60ea-4101-a7d9-6f02ebaf52f6, Nodes: ed90869c-317e-4303-8922-9fa83a3983cb{ip: 10.120.113.172, host: host172, networkLocation: /rack2, certSerialId: null}1da74a1d-f64d-4ad4-b04c-85f26687e683{ip: 10.121.124.44, host: host044, networkLocation: /rack2, certSerialId: null}b65b0b6c-b0bb-429f-a23d-467c72d4b85c{ip: 10.120.139.111, host: host111, networkLocation: /rack1, certSerialId: null}, Type:STAND_ALONE, Factor:THREE, State:OPEN]. 2019-11-01 11:46:02,266 WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: { o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo, 1572579956554, FILE, null } failed: Unexpected OzoneException: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 java.io.IOException: Unexpected OzoneException: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at 
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at 
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) ... 26 more Caused by: Checksum mismatch at index 0 org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at index 0 at org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) at org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) at org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) at org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) at org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) at org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) at org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) at java.io.DataInputStream.read(DataInputStream.java:100) at 
org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2019-11-01 11:46:02,266 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalizedResource: Resource o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo(->/data1/usercache/root/appcache/application_1567133159094_0055/filecache/10/job.splitmetainfo) transitioned from DOWNLOADING to FAILED 2019-11-01 11:46:02,266 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1567133159094_0055_01_000001 transitioned from LOCALIZING to LOCALIZATION_FAILED 2019-11-01 11:46:02,266 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalResourcesTrackerImpl: Container 
container_1567133159094_0055_01_000001 sent RELEASE event on a resource request { o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo, 1572579956554, FILE, null } not present in cache. 2019-11-01 11:46:02,266 WARN org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=root OPERATION=Container Finished - Failed TARGET=ContainerImpl RESULT=FAILURE DESCRIPTION=Container failed with state: LOCALIZATION_FAILED APPID=application_1567133159094_0055 CONTAINERID=container_1567133159094_0055_01_000001 2019-11-01 11:46:02,268 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1567133159094_0055_01_000001 transitioned from LOCALIZATION_FAILED to DONE 2019-11-01 11:46:02,268 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Removing container_1567133159094_0055_01_000001 from application application_1567133159094_0055 2019-11-01 11:46:02,268 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got event CONTAINER_STOP for appId application_1567133159094_0055 2019-11-01 11:46:02,270 WARN org.apache.hadoop.ipc.Client: interrupted waiting to send rpc request to server java.lang.InterruptedException at java.util.concurrent.FutureTask.awaitDone(FutureTask.java:404) at java.util.concurrent.FutureTask.get(FutureTask.java:191) at org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(Client.java:1060) at org.apache.hadoop.ipc.Client.call(Client.java:1455) at org.apache.hadoop.ipc.Client.call(Client.java:1413) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229) at com.sun.proxy.$Proxy75.heartbeat(Unknown Source) at org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:63) at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:255) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169) at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117) 2019-11-01 11:46:02,270 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Unknown localizer with localizerId container_1567133159094_0055_01_000001 is sending heartbeat. Ordering it to DIE 2019-11-01 11:46:03,271 INFO org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl: Removed completed containers from NM context: [container_1567133159094_0055_01_000001] 2019-11-01 11:46:03,745 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl: Stopping resource-monitoring for container_1567133159094_0055_01_000001 2019-11-01 11:46:03,817 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc: Unexpected exception while waiting for channel termination java.lang.InterruptedException at java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1326) at java.util.concurrent.CountDownLatch.await(CountDownLatch.java:277) at org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelImpl.awaitTermination(ManagedChannelImpl.java:763) at org.apache.ratis.thirdparty.io.grpc.internal.ForwardingManagedChannel.awaitTermination(ForwardingManagedChannel.java:57) at org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelOrphanWrapper.awaitTermination(ManagedChannelOrphanWrapper.java:70) at org.apache.hadoop.hdds.scm.XceiverClientGrpc.close(XceiverClientGrpc.java:201) at 
org.apache.hadoop.hdds.scm.XceiverClientSpi.cleanup(XceiverClientSpi.java:68) at org.apache.hadoop.hdds.scm.XceiverClientSpi.setEvicted(XceiverClientSpi.java:60) at org.apache.hadoop.hdds.scm.XceiverClientManager$1.onRemoval(XceiverClientManager.java:102) at com.google.common.cache.LocalCache.processPendingNotifications(LocalCache.java:2004) at com.google.common.cache.LocalCache$Segment.runUnlockedCleanup(LocalCache.java:3490) at com.google.common.cache.LocalCache$Segment.postWriteCleanup(LocalCache.java:3466) at com.google.common.cache.LocalCache$Segment.clear(LocalCache.java:3260) at com.google.common.cache.LocalCache.clear(LocalCache.java:4240) at com.google.common.cache.LocalCache$LocalManualCache.invalidateAll(LocalCache.java:4795) at org.apache.hadoop.hdds.scm.XceiverClientManager.close(XceiverClientManager.java:252) at org.apache.hadoop.io.IOUtils.cleanupWithLogger(IOUtils.java:280) at org.apache.hadoop.ozone.client.rpc.RpcClient.close(RpcClient.java:785) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.ozone.client.OzoneClientInvocationHandler.invoke(OzoneClientInvocationHandler.java:54) at com.sun.proxy.$Proxy1024.close(Unknown Source) at org.apache.hadoop.ozone.client.OzoneClient.close(OzoneClient.java:108) at org.apache.hadoop.fs.ozone.BasicOzoneClientAdapterImpl.close(BasicOzoneClientAdapterImpl.java:152) at org.apache.hadoop.fs.ozone.BasicOzoneFileSystem.close(BasicOzoneFileSystem.java:192) at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:2797) at org.apache.hadoop.fs.FileSystem.closeAllForUGI(FileSystem.java:459) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.closeFileSystems(ContainerLocalizer.java:223) at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:183) at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130) at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117) 2019-11-01 11:46:04,271 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Application application_1567133159094_0055 transitioned from RUNNING to APPLICATION_RESOURCES_CLEANINGUP 2019-11-01 11:46:04,271 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got event APPLICATION_STOP for appId application_1567133159094_0055 2019-11-01 11:46:04,271 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Application application_1567133159094_0055 transitioned from APPLICATION_RESOURCES_CLEANINGUP to FINISHED > Fail to read data through XceiverClientGrpc > ------------------------------------------- > > Key: HDDS-2376 > URL: https://issues.apache.org/jira/browse/HDDS-2376 > Project: Hadoop Distributed Data Store > Issue Type: Bug > Reporter: Sammi Chen > Assignee: Hanisha Koneru > Priority: Blocker > > Run teragen, application failed with following stack, > 19/10/29 14:35:42 INFO mapreduce.Job: Running job: job_1567133159094_0048 > 19/10/29 14:35:59 INFO mapreduce.Job: Job job_1567133159094_0048 running in > uber mode : false > 19/10/29 14:35:59 INFO mapreduce.Job: map 0% reduce 0% > 19/10/29 14:35:59 INFO mapreduce.Job: Job job_1567133159094_0048 failed with > state FAILED due to: Application application_1567133159094_0048 failed 2 > times due to AM Container for appattempt_1567133159094_0048_000002 exited > with exitCode: -1000 > For more detailed output, check application tracking > page:http://host183:8088/cluster/app/application_1567133159094_0048Then, > 
click on links to logs of each attempt. > Diagnostics: Unexpected OzoneException: > org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at > index 0 > java.io.IOException: Unexpected OzoneException: > org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at > index 0 > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) > at > org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) > at > org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) > at > org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) > at java.io.DataInputStream.read(DataInputStream.java:100) > at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) > at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) > at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) > at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) > at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) > at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) > at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) > at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) > at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) > at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Caused by: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum > mismatch at index 0 > at > org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) > at > org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) > at > org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) > at > org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) > at > org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) > at > org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) > at > org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) > ... 
26 more > Caused by: Checksum mismatch at index 0 > org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at > index 0 > at > org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148) > at > org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275) > at > org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375) > at > org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287) > at > org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250) > at > org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233) > at > org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) > at > org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) > at > org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) > at > org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52) > at java.io.DataInputStream.read(DataInputStream.java:100) > at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86) > at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60) > at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120) > at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366) > at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267) > at 
org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63) > at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361) > at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754) > at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359) > at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org