[
https://issues.apache.org/jira/browse/HDDS-7264?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Soumitra Sulav updated HDDS-7264:
---------------------------------
Priority: Critical (was: Major)
> [ozone-LR] FileSystemException Too many open files with fault injections
> ------------------------------------------------------------------------
>
> Key: HDDS-7264
> URL: https://issues.apache.org/jira/browse/HDDS-7264
> Project: Apache Ozone
> Issue Type: Bug
> Components: SCM
> Affects Versions: 1.2.0
> Reporter: Jyotirmoy Sinha
> Priority: Critical
>
> Steps:
> # Execute hive-write operation iterations on a long-running setup.
> # Inject the following faults into the system (one at a time, with an interval
> of 60 sec):
> a. datanode disk, process and network level faults
> b. scm disk, process and network level faults
> c. om process and network level faults
> Exceptions observed:
> {code:java}
> 2022-09-19 18:16:32,292 [ERROR] [TezChild] |scm.XceiverClientGrpc|: Failed to
> execute command ReadChunk on the pipeline Pipeline[ Id:
> caca4e29-9659-4d54-b2d8-85bdceffaca9, Nodes:
> 35447513-2e57-472d-bfb4-4b85b00bdbda{ip: 10.17.207.24, host:
> vc0114.halxg.cloudera.com, ports: [REPLICATION=9886, RATIS=9858,
> RATIS_ADMIN=9857, RATIS_SERVER=9856, STANDALONE=9859], networkLocation:
> /default, certSerialId: null, persistedOpState: IN_SERVICE,
> persistedOpStateExpiryEpochSec: 0}31b30082-3c55-459b-b0a8-636d335f2ae1{ip:
> 10.17.207.16, host: vc0106.halxg.cloudera.com, ports: [REPLICATION=9886,
> RATIS=9858, RATIS_ADMIN=9857, RATIS_SERVER=9856, STANDALONE=9859],
> networkLocation: /default, certSerialId: null, persistedOpState: IN_SERVICE,
> persistedOpStateExpiryEpochSec: 0}fefca023-0424-4d7d-85b7-d051b92772a1{ip:
> 10.17.207.17, host: vc0107.halxg.cloudera.com, ports: [REPLICATION=9886,
> RATIS=9858, RATIS_ADMIN=9857, RATIS_SERVER=9856, STANDALONE=9859],
> networkLocation: /default, certSerialId: null, persistedOpState: IN_SERVICE,
> persistedOpStateExpiryEpochSec: 0}, ReplicationConfig: STANDALONE/THREE,
> State:ALLOCATED, leaderId:,
> CreationTimestamp2022-09-19T18:16:32.097-07:00[America/Los_Angeles]].
> 2022-09-19 18:16:32,292 [INFO] [TezChild] |storage.BlockInputStream|: Unable
> to read information for block conID: 51001 locID: 109611004723385779 bcsId:
> 12436 from pipeline PipelineID=caca4e29-9659-4d54-b2d8-85bdceffaca9:
> java.nio.file.FileSystemException:
> /mnt/data/2/hadoop-ozone/datanode/data/hdds/CID-3bb0e46d-0182-4ca0-8730-278ab71bf767/current/containerDir99/51001/chunks/109611004723385779.block:
> Too many open files
> 2022-09-19 18:16:32,303 [WARN] [TezChild] |storage.BlockInputStream|: No new
> pipeline for block conID: 51001 locID: 109611004723385779 bcsId: 12436
> 2022-09-19 18:16:32,314 [INFO] [TezChild] |tez.ObjectCache|: Releasing key:
> hive_20220919181423_afc82272-207c-4be6-ac63-5cb2dd98282b_Map 1__MAP_PLAN__
> {code}
> {code:java}
> 2022-09-19 18:16:32,315 [ERROR] [TezChild] |tez.TezProcessor|:
> java.lang.RuntimeException: java.io.IOException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> java.nio.file.FileSystemException:
> /mnt/data/2/hadoop-ozone/datanode/data/hdds/CID-3bb0e46d-0182-4ca0-8730-278ab71bf767/current/containerDir99/51001/chunks/109611004723385779.block:
> Too many open files
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.initNextRecordReader(TezGroupedSplitsInputFormat.java:206)
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.<init>(TezGroupedSplitsInputFormat.java:145)
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.getRecordReader(TezGroupedSplitsInputFormat.java:111)
> at
> org.apache.tez.mapreduce.lib.MRReaderMapred.setupOldRecordReader(MRReaderMapred.java:156)
> at
> org.apache.tez.mapreduce.lib.MRReaderMapred.setSplit(MRReaderMapred.java:82)
> at
> org.apache.tez.mapreduce.input.MRInput.initFromEventInternal(MRInput.java:703)
> at
> org.apache.tez.mapreduce.input.MRInput.initFromEvent(MRInput.java:662)
> at
> org.apache.tez.mapreduce.input.MRInputLegacy.checkAndAwaitRecordReaderInitialization(MRInputLegacy.java:150)
> at
> org.apache.tez.mapreduce.input.MRInputLegacy.init(MRInputLegacy.java:114)
> at
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.getMRInput(MapRecordProcessor.java:543)
> at
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.init(MapRecordProcessor.java:189)
> at
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:268)
> at
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:252)
> at
> org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:75)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:62)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1898)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:62)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:38)
> at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
> at
> com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
> at
> com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69)
> at
> com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: java.io.IOException:
> org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException:
> java.nio.file.FileSystemException:
> /mnt/data/2/hadoop-ozone/datanode/data/hdds/CID-3bb0e46d-0182-4ca0-8730-278ab71bf767/current/containerDir99/51001/chunks/109611004723385779.block:
> Too many open files
> at
> org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
> at
> org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
> at
> org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:434)
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.initNextRecordReader(TezGroupedSplitsInputFormat.java:203)
> ... 27 more {code}
> {code:java}
> 2022-09-19 18:16:31,864 [ERROR] [TezChild] |tez.TezProcessor|:
> java.lang.RuntimeException: java.io.IOException: java.io.IOException:
> Unexpected OzoneException: java.io.IOException:
> java.util.concurrent.ExecutionException:
> org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException:
> DEADLINE_EXCEEDED: ClientCall was cancelled at or after deadline.
> [closed=[CANCELLED], committed=[remote_addr=10.17.207.17/10.17.207.17:9859]]
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.initNextRecordReader(TezGroupedSplitsInputFormat.java:206)
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.<init>(TezGroupedSplitsInputFormat.java:145)
> at
> org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.getRecordReader(TezGroupedSplitsInputFormat.java:111)
> at
> org.apache.tez.mapreduce.lib.MRReaderMapred.setupOldRecordReader(MRReaderMapred.java:156)
> at
> org.apache.tez.mapreduce.lib.MRReaderMapred.setSplit(MRReaderMapred.java:82)
> at
> org.apache.tez.mapreduce.input.MRInput.initFromEventInternal(MRInput.java:703)
> at
> org.apache.tez.mapreduce.input.MRInput.initFromEvent(MRInput.java:662)
> at
> org.apache.tez.mapreduce.input.MRInputLegacy.checkAndAwaitRecordReaderInitialization(MRInputLegacy.java:150)
> at
> org.apache.tez.mapreduce.input.MRInputLegacy.init(MRInputLegacy.java:114)
> at
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.getMRInput(MapRecordProcessor.java:543)
> at
> org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.init(MapRecordProcessor.java:189)
> at
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:268)
> at
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:252)
> at
> org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:75)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:62)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1898)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:62)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:38)
> at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
> at
> com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
> at
> com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69)
> at
> com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) {code}
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]