[jira] [Commented] (HDDS-1809) Ozone Read fails with StatusRunTimeExceptions after 2 datanode fail in Ratis pipeline
[ https://issues.apache.org/jira/browse/HDDS-1809?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16890729#comment-16890729 ] Sammi Chen commented on HDDS-1809: -- Thanks [~shashikant] for reporting this issue. It has been fixed by the code change in HDDS-1713. The root cause is that previously the network topology used the IP address as the node key in the topology cluster, which resulted in all three sorted Datanodes being the same node. Now the datanode UUID is used as the node key in the topology cluster, so the sorted Datanodes will be three different nodes. > Ozone Read fails with StatusRunTimeExceptions after 2 datanode fail in Ratis > pipeline > - > > Key: HDDS-1809 > URL: https://issues.apache.org/jira/browse/HDDS-1809 > Project: Hadoop Distributed Data Store > Issue Type: Bug > Components: Ozone Client >Affects Versions: 0.5.0 >Reporter: Shashikant Banerjee >Assignee: Sammi Chen >Priority: Major > Fix For: 0.5.0 > > > {code:java} > java.io.IOException: Unexpected OzoneException: java.io.IOException: > java.util.concurrent.ExecutionException: > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE: io > exception > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) > at > org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) > at > org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) > at > org.apache.hadoop.ozone.client.io.OzoneInputStream.read(OzoneInputStream.java:47) > at java.io.InputStream.read(InputStream.java:101) > at > org.apache.hadoop.ozone.container.ContainerTestHelper.validateData(ContainerTestHelper.java:709) > at > 
org.apache.hadoop.ozone.client.rpc.TestFailureHandlingByClient.validateData(TestFailureHandlingByClient.java:458) > at > org.apache.hadoop.ozone.client.rpc.TestFailureHandlingByClient.testBlockWritesWithDnFailures(TestFailureHandlingByClient.java:158) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47) > at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) > at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50) > at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236) > at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229) > at org.junit.runners.ParentRunner.run(ParentRunner.java:309) > at org.junit.runner.JUnitCore.run(JUnitCore.java:160) > at > com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68) > at > com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47) > at > com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242) > at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70) 
> {code} -- This message was sent by Atlassian JIRA (v7.6.14#76016) - To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org
[jira] [Commented] (HDDS-1809) Ozone Read fails with StatusRunTimeExceptions after 2 datanode fail in Ratis pipeline
[ https://issues.apache.org/jira/browse/HDDS-1809?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16886668#comment-16886668 ] Mukul Kumar Singh commented on HDDS-1809: - cc: [~xyao] > Ozone Read fails with StatusRunTimeExceptions after 2 datanode fail in Ratis > pipeline > - > > Key: HDDS-1809 > URL: https://issues.apache.org/jira/browse/HDDS-1809 > Project: Hadoop Distributed Data Store > Issue Type: Bug > Components: Ozone Client >Affects Versions: 0.5.0 >Reporter: Shashikant Banerjee >Assignee: Sammi Chen >Priority: Major > Fix For: 0.5.0 > > > {code:java} > java.io.IOException: Unexpected OzoneException: java.io.IOException: > java.util.concurrent.ExecutionException: > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE: io > exception > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) > at > org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) > at > org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) > at > org.apache.hadoop.ozone.client.io.OzoneInputStream.read(OzoneInputStream.java:47) > at java.io.InputStream.read(InputStream.java:101) > at > org.apache.hadoop.ozone.container.ContainerTestHelper.validateData(ContainerTestHelper.java:709) > at > org.apache.hadoop.ozone.client.rpc.TestFailureHandlingByClient.validateData(TestFailureHandlingByClient.java:458) > at > org.apache.hadoop.ozone.client.rpc.TestFailureHandlingByClient.testBlockWritesWithDnFailures(TestFailureHandlingByClient.java:158) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47) > at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) > at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50) > at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236) > at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229) > at org.junit.runners.ParentRunner.run(ParentRunner.java:309) > at org.junit.runner.JUnitCore.run(JUnitCore.java:160) > at > com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68) > at > com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47) > at > com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242) > at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70) > {code} -- This message was sent by Atlassian JIRA (v7.6.14#76016) - To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org
[jira] [Commented] (HDDS-1809) Ozone Read fails with StatusRunTimeExceptions after 2 datanode fail in Ratis pipeline
[ https://issues.apache.org/jira/browse/HDDS-1809?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16886071#comment-16886071 ] Shashikant Banerjee commented on HDDS-1809: --- The issue is happening because, while doing the read with rackAwareness enabled, the pipeline.getNodesInOrder call returns the same datanode added thrice in the datanodeList, as shown below; hence, if a failure is encountered, the read is retried on the same dn. {code:java} if ((request.getCmdType() == ContainerProtos.Type.ReadChunk || request.getCmdType() == ContainerProtos.Type.GetSmallFile) && topologyAwareRead) { datanodeList = pipeline.getNodesInOrder(); } else { datanodeList = pipeline.getNodes(); } datanodeList [f4b0bdf3-66d4-452c-82af-8a570ac0aeb7{ip: 192.168.43.156, host: hw15685, networkLocation: /default-rack, certSerialId: null}, f4b0bdf3-66d4-452c-82af-8a570ac0aeb7{ip: 192.168.43.156, host: hw15685, networkLocation: /default-rack, certSerialId: null}, f4b0bdf3-66d4-452c-82af-8a570ac0aeb7{ip: 192.168.43.156, host: hw15685, networkLocation: /default-rack, certSerialId: null}] Pipeline[ Id: 865a2079-de8e-472c-baaa-5aa345ed5e57, Nodes: f4b0bdf3-66d4-452c-82af-8a570ac0aeb7{ip: 192.168.43.156, host: hw15685, networkLocation: /default-rack, certSerialId: null}14975e64-2564-433d-9b89-c295083a1161{ip: 192.168.43.156, host: hw15685, networkLocation: /default-rack, certSerialId: null}efc0749c-c7eb-4b73-a4b2-0abe553ca5e9{ip: 192.168.43.156, host: hw15685, networkLocation: /default-rack, certSerialId: null}, Type:STAND_ALONE, Factor:THREE, State:OPEN] {code} The read path works well with the Network topology feature turned off. 
> Ozone Read fails with StatusRunTimeExceptions after 2 datanode fail in Ratis > pipeline > - > > Key: HDDS-1809 > URL: https://issues.apache.org/jira/browse/HDDS-1809 > Project: Hadoop Distributed Data Store > Issue Type: Bug >Affects Versions: 0.5.0 >Reporter: Shashikant Banerjee >Priority: Major > Fix For: 0.5.0 > > > {code:java} > java.io.IOException: Unexpected OzoneException: java.io.IOException: > java.util.concurrent.ExecutionException: > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE: io > exception > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunk(ChunkInputStream.java:342) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.readChunkFromContainer(ChunkInputStream.java:307) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.prepareRead(ChunkInputStream.java:259) > at > org.apache.hadoop.hdds.scm.storage.ChunkInputStream.read(ChunkInputStream.java:144) > at > org.apache.hadoop.hdds.scm.storage.BlockInputStream.read(BlockInputStream.java:239) > at > org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:171) > at > org.apache.hadoop.ozone.client.io.OzoneInputStream.read(OzoneInputStream.java:47) > at java.io.InputStream.read(InputStream.java:101) > at > org.apache.hadoop.ozone.container.ContainerTestHelper.validateData(ContainerTestHelper.java:709) > at > org.apache.hadoop.ozone.client.rpc.TestFailureHandlingByClient.validateData(TestFailureHandlingByClient.java:458) > at > org.apache.hadoop.ozone.client.rpc.TestFailureHandlingByClient.testBlockWritesWithDnFailures(TestFailureHandlingByClient.java:158) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > 
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47) > at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) > at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50) > at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236) > at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229) > at org.junit.runners.ParentRunner.run(ParentRunner.java:309) > at org.junit.runner.JUnitCore.run(JUnitCore.java:160) > at >