[ https://issues.apache.org/jira/browse/HDDS-7264?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Jyotirmoy Sinha updated HDDS-7264:
----------------------------------
    Component/s: SCM

> [ozone-LR] FileSystemException Too many open files with fault injections
> ------------------------------------------------------------------------
>
>                 Key: HDDS-7264
>                 URL: https://issues.apache.org/jira/browse/HDDS-7264
>             Project: Apache Ozone
>          Issue Type: Bug
>          Components: SCM
>    Affects Versions: 1.2.0
>            Reporter: Jyotirmoy Sinha
>            Priority: Major
>
> Steps:
>  # Execute hive-write operation iterations on a long-running setup.
>  # Inject the following faults into the system (one at a time, with a 60 sec 
> interval between injections):
> a. datanode disk, process and network level faults
> b. scm disk, process and network level faults
> c. om process and network level faults
> Exceptions observed:
> {code:java}
> 2022-09-19 18:16:32,292 [ERROR] [TezChild] |scm.XceiverClientGrpc|: Failed to 
> execute command ReadChunk on the pipeline Pipeline[ Id: 
> caca4e29-9659-4d54-b2d8-85bdceffaca9, Nodes: 
> 35447513-2e57-472d-bfb4-4b85b00bdbda{ip: 10.17.207.24, host: 
> vc0114.halxg.cloudera.com, ports: [REPLICATION=9886, RATIS=9858, 
> RATIS_ADMIN=9857, RATIS_SERVER=9856, STANDALONE=9859], networkLocation: 
> /default, certSerialId: null, persistedOpState: IN_SERVICE, 
> persistedOpStateExpiryEpochSec: 0}31b30082-3c55-459b-b0a8-636d335f2ae1{ip: 
> 10.17.207.16, host: vc0106.halxg.cloudera.com, ports: [REPLICATION=9886, 
> RATIS=9858, RATIS_ADMIN=9857, RATIS_SERVER=9856, STANDALONE=9859], 
> networkLocation: /default, certSerialId: null, persistedOpState: IN_SERVICE, 
> persistedOpStateExpiryEpochSec: 0}fefca023-0424-4d7d-85b7-d051b92772a1{ip: 
> 10.17.207.17, host: vc0107.halxg.cloudera.com, ports: [REPLICATION=9886, 
> RATIS=9858, RATIS_ADMIN=9857, RATIS_SERVER=9856, STANDALONE=9859], 
> networkLocation: /default, certSerialId: null, persistedOpState: IN_SERVICE, 
> persistedOpStateExpiryEpochSec: 0}, ReplicationConfig: STANDALONE/THREE, 
> State:ALLOCATED, leaderId:, 
> CreationTimestamp2022-09-19T18:16:32.097-07:00[America/Los_Angeles]].
> 2022-09-19 18:16:32,292 [INFO] [TezChild] |storage.BlockInputStream|: Unable 
> to read information for block conID: 51001 locID: 109611004723385779 bcsId: 
> 12436 from pipeline PipelineID=caca4e29-9659-4d54-b2d8-85bdceffaca9: 
> java.nio.file.FileSystemException: 
> /mnt/data/2/hadoop-ozone/datanode/data/hdds/CID-3bb0e46d-0182-4ca0-8730-278ab71bf767/current/containerDir99/51001/chunks/109611004723385779.block:
>  Too many open files
> 2022-09-19 18:16:32,303 [WARN] [TezChild] |storage.BlockInputStream|: No new 
> pipeline for block conID: 51001 locID: 109611004723385779 bcsId: 12436
> 2022-09-19 18:16:32,314 [INFO] [TezChild] |tez.ObjectCache|: Releasing key: 
> hive_20220919181423_afc82272-207c-4be6-ac63-5cb2dd98282b_Map 1__MAP_PLAN__ 
> {code}
> {code:java}
> 2022-09-19 18:16:32,315 [ERROR] [TezChild] |tez.TezProcessor|: 
> java.lang.RuntimeException: java.io.IOException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  java.nio.file.FileSystemException: 
> /mnt/data/2/hadoop-ozone/datanode/data/hdds/CID-3bb0e46d-0182-4ca0-8730-278ab71bf767/current/containerDir99/51001/chunks/109611004723385779.block:
>  Too many open files
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.initNextRecordReader(TezGroupedSplitsInputFormat.java:206)
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.<init>(TezGroupedSplitsInputFormat.java:145)
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.getRecordReader(TezGroupedSplitsInputFormat.java:111)
>         at 
> org.apache.tez.mapreduce.lib.MRReaderMapred.setupOldRecordReader(MRReaderMapred.java:156)
>         at 
> org.apache.tez.mapreduce.lib.MRReaderMapred.setSplit(MRReaderMapred.java:82)
>         at 
> org.apache.tez.mapreduce.input.MRInput.initFromEventInternal(MRInput.java:703)
>         at 
> org.apache.tez.mapreduce.input.MRInput.initFromEvent(MRInput.java:662)
>         at 
> org.apache.tez.mapreduce.input.MRInputLegacy.checkAndAwaitRecordReaderInitialization(MRInputLegacy.java:150)
>         at 
> org.apache.tez.mapreduce.input.MRInputLegacy.init(MRInputLegacy.java:114)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.getMRInput(MapRecordProcessor.java:543)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.init(MapRecordProcessor.java:189)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:268)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:252)
>         at 
> org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:75)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:62)
>         at java.security.AccessController.doPrivileged(Native Method)
>         at javax.security.auth.Subject.doAs(Subject.java:422)
>         at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1898)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:62)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:38)
>         at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
>         at 
> com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
>         at 
> com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69)
>         at 
> com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: java.io.IOException: 
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
>  java.nio.file.FileSystemException: 
> /mnt/data/2/hadoop-ozone/datanode/data/hdds/CID-3bb0e46d-0182-4ca0-8730-278ab71bf767/current/containerDir99/51001/chunks/109611004723385779.block:
>  Too many open files
>         at 
> org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
>         at 
> org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
>         at 
> org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:434)
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.initNextRecordReader(TezGroupedSplitsInputFormat.java:203)
>         ... 27 more {code}
> {code:java}
> 2022-09-19 18:16:31,864 [ERROR] [TezChild] |tez.TezProcessor|: 
> java.lang.RuntimeException: java.io.IOException: java.io.IOException: 
> Unexpected OzoneException: java.io.IOException: 
> java.util.concurrent.ExecutionException: 
> org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: 
> DEADLINE_EXCEEDED: ClientCall was cancelled at or after deadline. 
> [closed=[CANCELLED], committed=[remote_addr=10.17.207.17/10.17.207.17:9859]]
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.initNextRecordReader(TezGroupedSplitsInputFormat.java:206)
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.<init>(TezGroupedSplitsInputFormat.java:145)
>         at 
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.getRecordReader(TezGroupedSplitsInputFormat.java:111)
>         at 
> org.apache.tez.mapreduce.lib.MRReaderMapred.setupOldRecordReader(MRReaderMapred.java:156)
>         at 
> org.apache.tez.mapreduce.lib.MRReaderMapred.setSplit(MRReaderMapred.java:82)
>         at 
> org.apache.tez.mapreduce.input.MRInput.initFromEventInternal(MRInput.java:703)
>         at 
> org.apache.tez.mapreduce.input.MRInput.initFromEvent(MRInput.java:662)
>         at 
> org.apache.tez.mapreduce.input.MRInputLegacy.checkAndAwaitRecordReaderInitialization(MRInputLegacy.java:150)
>         at 
> org.apache.tez.mapreduce.input.MRInputLegacy.init(MRInputLegacy.java:114)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.getMRInput(MapRecordProcessor.java:543)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.init(MapRecordProcessor.java:189)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:268)
>         at 
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:252)
>         at 
> org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:75)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:62)
>         at java.security.AccessController.doPrivileged(Native Method)
>         at javax.security.auth.Subject.doAs(Subject.java:422)
>         at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1898)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:62)
>         at 
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:38)
>         at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
>         at 
> com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
>         at 
> com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69)
>         at 
> com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748) {code}
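> To confirm whether the datanode JVM is actually exhausting its file-descriptor limit during the fault-injection window, a minimal diagnostic sketch is shown below (hypothetical class name {{FdMonitor}}; the 60 sec sampling interval simply mirrors the fault-injection interval and is not taken from this run):
> {code:java}
> // Minimal diagnostic sketch (not part of the original run): periodically
> // sample the open file-descriptor count of the current JVM, e.g. when run
> // inside or alongside the datanode process, to see whether the count climbs
> // toward the limit before "Too many open files" appears.
> import java.lang.management.ManagementFactory;
> import java.lang.management.OperatingSystemMXBean;
> import com.sun.management.UnixOperatingSystemMXBean;
>
> public class FdMonitor {
>   public static void main(String[] args) throws InterruptedException {
>     OperatingSystemMXBean os = ManagementFactory.getOperatingSystemMXBean();
>     if (!(os instanceof UnixOperatingSystemMXBean)) {
>       System.err.println("Open-FD metrics are only exposed on Unix-like JVMs");
>       return;
>     }
>     UnixOperatingSystemMXBean unixOs = (UnixOperatingSystemMXBean) os;
>     while (true) {
>       long open = unixOs.getOpenFileDescriptorCount();
>       long max = unixOs.getMaxFileDescriptorCount();
>       System.out.printf("open fds: %d / %d (%.1f%% used)%n",
>           open, max, 100.0 * open / max);
>       Thread.sleep(60_000L); // 60 sec, matching the fault-injection interval
>     }
>   }
> }
> {code}
> A count that keeps rising across iterations and never drops back would point at block/chunk streams or handles not being released, rather than a ulimit that is simply too low.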
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)
