[
https://issues.apache.org/jira/browse/HDDS-2376?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16964661#comment-16964661
]
Sammi Chen edited comment on HDDS-2376 at 11/1/19 6:56 AM:
-----------------------------------------------------------
The root cause is that I didn't restart Hadoop 2.7.5 after I deployed the latest
Ozone binary, so Hadoop was still using an old version of the Ozone client
(about 2 months old). This OzoneChecksumException is thrown by the NodeManager.
Logs attached. It seems something changed on the Ozone server side, which makes
an old version of the Ozone client unable to verify the data written by itself.
[~msingh] and [~hanishakoneru], thanks for paying attention to this issue. I
will close it now.
2019-11-01 11:46:02,230 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on datanode 1da74a1d-f64d-4ad4-b04c-85f26687e683
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,243 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on datanode ed90869c-317e-4303-8922-9fa83a3983cb
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,262 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on datanode b65b0b6c-b0bb-429f-a23d-467c72d4b85c
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,263 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on the pipeline Pipeline[ Id: 37b13907-60ea-4101-a7d9-6f02ebaf52f6, Nodes:
ed90869c-317e-4303-8922-9fa83a3983cb{ip: 10.120.113.172, host: host172,
networkLocation: /rack2, certSerialId:
null}1da74a1d-f64d-4ad4-b04c-85f26687e683{ip: 10.121.124.44, host: host044,
networkLocation: /rack2, certSerialId:
null}b65b0b6c-b0bb-429f-a23d-467c72d4b85c{ip: 10.120.139.111, host: host111,
networkLocation: /rack1, certSerialId: null}, Type:STAND_ALONE, Factor:THREE,
State:OPEN].
2019-11-01 11:46:02,266 WARN
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService:
{
o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo,
1572579956554, FILE, null } failed: Unexpected OzoneException:
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
java.io.IOException: Unexpected OzoneException:
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum
mismatch at index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
... 26 more
Caused by: Checksum mismatch at index 0
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,266 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalizedResource:
Resource
o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo(->/data1/usercache/root/appcache/application_1567133159094_0055/filecache/10/job.splitmetainfo)
transitioned from DOWNLOADING to FAILED
2019-11-01 11:46:02,266 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl:
Container container_1567133159094_0055_01_000001 transitioned from LOCALIZING
to LOCALIZATION_FAILED
2019-11-01 11:46:02,266 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalResourcesTrackerImpl:
Container container_1567133159094_0055_01_000001 sent RELEASE event on a
resource request {
o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo,
1572579956554, FILE, null } not present in cache.
2019-11-01 11:46:02,266 WARN
org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=root
OPERATION=Container Finished - Failed TARGET=ContainerImpl RESULT=FAILURE
DESCRIPTION=Container failed with state: LOCALIZATION_FAILED
APPID=application_1567133159094_0055
CONTAINERID=container_1567133159094_0055_01_000001
2019-11-01 11:46:02,268 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl:
Container container_1567133159094_0055_01_000001 transitioned from
LOCALIZATION_FAILED to DONE
2019-11-01 11:46:02,268 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl:
Removing container_1567133159094_0055_01_000001 from application
application_1567133159094_0055
2019-11-01 11:46:02,268 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got
event CONTAINER_STOP for appId application_1567133159094_0055
2019-11-01 11:46:02,270 WARN org.apache.hadoop.ipc.Client: interrupted waiting
to send rpc request to server
java.lang.InterruptedException
at java.util.concurrent.FutureTask.awaitDone(FutureTask.java:404)
at java.util.concurrent.FutureTask.get(FutureTask.java:191)
at
org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(Client.java:1060)
at org.apache.hadoop.ipc.Client.call(Client.java:1455)
at org.apache.hadoop.ipc.Client.call(Client.java:1413)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229)
at com.sun.proxy.$Proxy75.heartbeat(Unknown Source)
at
org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:63)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:255)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169)
at
org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117)
2019-11-01 11:46:02,270 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService:
Unknown localizer with localizerId container_1567133159094_0055_01_000001 is
sending heartbeat. Ordering it to DIE
2019-11-01 11:46:03,271 INFO
org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl: Removed
completed containers from NM context: [container_1567133159094_0055_01_000001]
2019-11-01 11:46:03,745 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl:
Stopping resource-monitoring for container_1567133159094_0055_01_000001
2019-11-01 11:46:03,817 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Unexpected exception while waiting for channel termination
java.lang.InterruptedException
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1326)
at java.util.concurrent.CountDownLatch.await(CountDownLatch.java:277)
at
org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelImpl.awaitTermination(ManagedChannelImpl.java:763)
at
org.apache.ratis.thirdparty.io.grpc.internal.ForwardingManagedChannel.awaitTermination(ForwardingManagedChannel.java:57)
at
org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelOrphanWrapper.awaitTermination(ManagedChannelOrphanWrapper.java:70)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.close(XceiverClientGrpc.java:201)
at
org.apache.hadoop.hdds.scm.XceiverClientSpi.cleanup(XceiverClientSpi.java:68)
at
org.apache.hadoop.hdds.scm.XceiverClientSpi.setEvicted(XceiverClientSpi.java:60)
at
org.apache.hadoop.hdds.scm.XceiverClientManager$1.onRemoval(XceiverClientManager.java:102)
at
com.google.common.cache.LocalCache.processPendingNotifications(LocalCache.java:2004)
at
com.google.common.cache.LocalCache$Segment.runUnlockedCleanup(LocalCache.java:3490)
at
com.google.common.cache.LocalCache$Segment.postWriteCleanup(LocalCache.java:3466)
at
com.google.common.cache.LocalCache$Segment.clear(LocalCache.java:3260)
at com.google.common.cache.LocalCache.clear(LocalCache.java:4240)
at
com.google.common.cache.LocalCache$LocalManualCache.invalidateAll(LocalCache.java:4795)
at
org.apache.hadoop.hdds.scm.XceiverClientManager.close(XceiverClientManager.java:252)
at org.apache.hadoop.io.IOUtils.cleanupWithLogger(IOUtils.java:280)
at
org.apache.hadoop.ozone.client.rpc.RpcClient.close(RpcClient.java:785)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.ozone.client.OzoneClientInvocationHandler.invoke(OzoneClientInvocationHandler.java:54)
at com.sun.proxy.$Proxy1024.close(Unknown Source)
at
org.apache.hadoop.ozone.client.OzoneClient.close(OzoneClient.java:108)
at
org.apache.hadoop.fs.ozone.BasicOzoneClientAdapterImpl.close(BasicOzoneClientAdapterImpl.java:152)
at
org.apache.hadoop.fs.ozone.BasicOzoneFileSystem.close(BasicOzoneFileSystem.java:192)
at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:2797)
at org.apache.hadoop.fs.FileSystem.closeAllForUGI(FileSystem.java:459)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.closeFileSystems(ContainerLocalizer.java:223)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:183)
at
org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117)
2019-11-01 11:46:04,271 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl:
Application application_1567133159094_0055 transitioned from RUNNING to
APPLICATION_RESOURCES_CLEANINGUP
2019-11-01 11:46:04,271 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got
event APPLICATION_STOP for appId application_1567133159094_0055
2019-11-01 11:46:04,271 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl:
Application application_1567133159094_0055 transitioned from
APPLICATION_RESOURCES_CLEANINGUP to FINISHED
was (Author: sammi):
The root cause is I didn't retart Hadoop 2.7.5 after I deploied the latest
Ozone binary. So the Hadoop still use an old version Ozone client(2 month
before) . This OzoneChecksumException is thrown out by NodeManager. Logs
attached. It seems something is changed in Ozone server side, which makes an
old version Ozone client cann't verify the data written by itself.
2019-11-01 11:46:02,230 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on datanode 1da74a1d-f64d-4ad4-b04c-85f26687e683
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,243 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on datanode ed90869c-317e-4303-8922-9fa83a3983cb
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,262 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on datanode b65b0b6c-b0bb-429f-a23d-467c72d4b85c
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,263 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Failed to execute command cmdType: ReadChunk
traceID: ""
containerID: 1145
datanodeUuid: "ed90869c-317e-4303-8922-9fa83a3983cb"
readChunk {
blockID {
containerID: 1145
localID: 103060600027086850
blockCommitSequenceId: 948
}
chunkData {
chunkName: "103060600027086850_chunk_1"
offset: 0
len: 245
checksumData {
type: CRC32
bytesPerChecksum: 1048576
checksums: "\247\304Yf"
}
}
}
on the pipeline Pipeline[ Id: 37b13907-60ea-4101-a7d9-6f02ebaf52f6, Nodes:
ed90869c-317e-4303-8922-9fa83a3983cb{ip: 10.120.113.172, host: host172,
networkLocation: /rack2, certSerialId:
null}1da74a1d-f64d-4ad4-b04c-85f26687e683{ip: 10.121.124.44, host: host044,
networkLocation: /rack2, certSerialId:
null}b65b0b6c-b0bb-429f-a23d-467c72d4b85c{ip: 10.120.139.111, host: host111,
networkLocation: /rack1, certSerialId: null}, Type:STAND_ALONE, Factor:THREE,
State:OPEN].
2019-11-01 11:46:02,266 WARN
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService:
{
o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo,
1572579956554, FILE, null } failed: Unexpected OzoneException:
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
java.io.IOException: Unexpected OzoneException:
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum
mismatch at index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
... 26 more
Caused by: Checksum mismatch at index 0
org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
index 0
at
org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
at
org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
at
org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
at
org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
at
org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-11-01 11:46:02,266 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalizedResource:
Resource
o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo(->/data1/usercache/root/appcache/application_1567133159094_0055/filecache/10/job.splitmetainfo)
transitioned from DOWNLOADING to FAILED
2019-11-01 11:46:02,266 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl:
Container container_1567133159094_0055_01_000001 transitioned from LOCALIZING
to LOCALIZATION_FAILED
2019-11-01 11:46:02,266 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalResourcesTrackerImpl:
Container container_1567133159094_0055_01_000001 sent RELEASE event on a
resource request {
o3fs://bucket.hadoop/tmp/hadoop-yarn/staging/root/.staging/job_1567133159094_0055/job.splitmetainfo,
1572579956554, FILE, null } not present in cache.
2019-11-01 11:46:02,266 WARN
org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=root
OPERATION=Container Finished - Failed TARGET=ContainerImpl RESULT=FAILURE
DESCRIPTION=Container failed with state: LOCALIZATION_FAILED
APPID=application_1567133159094_0055
CONTAINERID=container_1567133159094_0055_01_000001
2019-11-01 11:46:02,268 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl:
Container container_1567133159094_0055_01_000001 transitioned from
LOCALIZATION_FAILED to DONE
2019-11-01 11:46:02,268 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl:
Removing container_1567133159094_0055_01_000001 from application
application_1567133159094_0055
2019-11-01 11:46:02,268 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got
event CONTAINER_STOP for appId application_1567133159094_0055
2019-11-01 11:46:02,270 WARN org.apache.hadoop.ipc.Client: interrupted waiting
to send rpc request to server
java.lang.InterruptedException
at java.util.concurrent.FutureTask.awaitDone(FutureTask.java:404)
at java.util.concurrent.FutureTask.get(FutureTask.java:191)
at
org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(Client.java:1060)
at org.apache.hadoop.ipc.Client.call(Client.java:1455)
at org.apache.hadoop.ipc.Client.call(Client.java:1413)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229)
at com.sun.proxy.$Proxy75.heartbeat(Unknown Source)
at
org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:63)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:255)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169)
at
org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117)
2019-11-01 11:46:02,270 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService:
Unknown localizer with localizerId container_1567133159094_0055_01_000001 is
sending heartbeat. Ordering it to DIE
2019-11-01 11:46:03,271 INFO
org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl: Removed
completed containers from NM context: [container_1567133159094_0055_01_000001]
2019-11-01 11:46:03,745 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl:
Stopping resource-monitoring for container_1567133159094_0055_01_000001
2019-11-01 11:46:03,817 ERROR org.apache.hadoop.hdds.scm.XceiverClientGrpc:
Unexpected exception while waiting for channel termination
java.lang.InterruptedException
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1326)
at java.util.concurrent.CountDownLatch.await(CountDownLatch.java:277)
at
org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelImpl.awaitTermination(ManagedChannelImpl.java:763)
at
org.apache.ratis.thirdparty.io.grpc.internal.ForwardingManagedChannel.awaitTermination(ForwardingManagedChannel.java:57)
at
org.apache.ratis.thirdparty.io.grpc.internal.ManagedChannelOrphanWrapper.awaitTermination(ManagedChannelOrphanWrapper.java:70)
at
org.apache.hadoop.hdds.scm.XceiverClientGrpc.close(XceiverClientGrpc.java:201)
at
org.apache.hadoop.hdds.scm.XceiverClientSpi.cleanup(XceiverClientSpi.java:68)
at
org.apache.hadoop.hdds.scm.XceiverClientSpi.setEvicted(XceiverClientSpi.java:60)
at
org.apache.hadoop.hdds.scm.XceiverClientManager$1.onRemoval(XceiverClientManager.java:102)
at
com.google.common.cache.LocalCache.processPendingNotifications(LocalCache.java:2004)
at
com.google.common.cache.LocalCache$Segment.runUnlockedCleanup(LocalCache.java:3490)
at
com.google.common.cache.LocalCache$Segment.postWriteCleanup(LocalCache.java:3466)
at
com.google.common.cache.LocalCache$Segment.clear(LocalCache.java:3260)
at com.google.common.cache.LocalCache.clear(LocalCache.java:4240)
at
com.google.common.cache.LocalCache$LocalManualCache.invalidateAll(LocalCache.java:4795)
at
org.apache.hadoop.hdds.scm.XceiverClientManager.close(XceiverClientManager.java:252)
at org.apache.hadoop.io.IOUtils.cleanupWithLogger(IOUtils.java:280)
at
org.apache.hadoop.ozone.client.rpc.RpcClient.close(RpcClient.java:785)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.ozone.client.OzoneClientInvocationHandler.invoke(OzoneClientInvocationHandler.java:54)
at com.sun.proxy.$Proxy1024.close(Unknown Source)
at
org.apache.hadoop.ozone.client.OzoneClient.close(OzoneClient.java:108)
at
org.apache.hadoop.fs.ozone.BasicOzoneClientAdapterImpl.close(BasicOzoneClientAdapterImpl.java:152)
at
org.apache.hadoop.fs.ozone.BasicOzoneFileSystem.close(BasicOzoneFileSystem.java:192)
at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:2797)
at org.apache.hadoop.fs.FileSystem.closeAllForUGI(FileSystem.java:459)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.closeFileSystems(ContainerLocalizer.java:223)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:183)
at
org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:130)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1117)
2019-11-01 11:46:04,271 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl:
Application application_1567133159094_0055 transitioned from RUNNING to
APPLICATION_RESOURCES_CLEANINGUP
2019-11-01 11:46:04,271 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices: Got
event APPLICATION_STOP for appId application_1567133159094_0055
2019-11-01 11:46:04,271 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl:
Application application_1567133159094_0055 transitioned from
APPLICATION_RESOURCES_CLEANINGUP to FINISHED
> Fail to read data through XceiverClientGrpc
> -------------------------------------------
>
> Key: HDDS-2376
> URL: https://issues.apache.org/jira/browse/HDDS-2376
> Project: Hadoop Distributed Data Store
> Issue Type: Bug
> Reporter: Sammi Chen
> Assignee: Hanisha Koneru
> Priority: Blocker
>
> Run teragen, application failed with following stack,
> 19/10/29 14:35:42 INFO mapreduce.Job: Running job: job_1567133159094_0048
> 19/10/29 14:35:59 INFO mapreduce.Job: Job job_1567133159094_0048 running in
> uber mode : false
> 19/10/29 14:35:59 INFO mapreduce.Job: map 0% reduce 0%
> 19/10/29 14:35:59 INFO mapreduce.Job: Job job_1567133159094_0048 failed with
> state FAILED due to: Application application_1567133159094_0048 failed 2
> times due to AM Container for appattempt_1567133159094_0048_000002 exited
> with exitCode: -1000
> For more detailed output, check application tracking
> page: http://host183:8088/cluster/app/application_1567133159094_0048
> Then, click on links to logs of each attempt.
> Diagnostics: Unexpected OzoneException:
> org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
> index 0
> java.io.IOException: Unexpected OzoneException:
> org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
> index 0
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
> at
> org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
> at
> org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
> at java.io.DataInputStream.read(DataInputStream.java:100)
> at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
> at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
> at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
> at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
> at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
> at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
> at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
> at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
> at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
> at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum
> mismatch at index 0
> at
> org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
> at
> org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
> at
> org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
> at
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
> ... 26 more
> Caused by: Checksum mismatch at index 0
> org.apache.hadoop.ozone.common.OzoneChecksumException: Checksum mismatch at
> index 0
> at
> org.apache.hadoop.ozone.common.ChecksumData.verifyChecksumDataMatches(ChecksumData.java:148)
> at
> org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:275)
> at
> org.apache.hadoop.ozone.common.Checksum.verifyChecksum(Checksum.java:238)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.lambda$new$0(ChunkInputStream.java:375)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithRetry(XceiverClientGrpc.java:287)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommandWithTraceIDAndRetry(XceiverClientGrpc.java:250)
> at
> org.apache.hadoop.hdds.scm.XceiverClientGrpc.sendCommand(XceiverClientGrpc.java:233)
> at
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.readChunk(ContainerProtocolCalls.java:245)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:335)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259)
> at
> org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239)
> at
> org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171)
> at
> org.apache.hadoop.fs.ozone.OzoneFSInputStream.read(OzoneFSInputStream.java:52)
> at java.io.DataInputStream.read(DataInputStream.java:100)
> at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:86)
> at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:60)
> at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:120)
> at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:366)
> at org.apache.hadoop.yarn.util.FSDownload.copy(FSDownload.java:267)
> at org.apache.hadoop.yarn.util.FSDownload.access$000(FSDownload.java:63)
> at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:361)
> at org.apache.hadoop.yarn.util.FSDownload$2.run(FSDownload.java:359)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
> at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:359)
> at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:62)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]