[ https://issues.apache.org/jira/browse/IGNITE-12774?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Alexey Goncharuk updated IGNITE-12774: -------------------------------------- Ignite Flags: (was: Release Notes Required) > Transaction hangs after too many open files NIO exception > --------------------------------------------------------- > > Key: IGNITE-12774 > URL: https://issues.apache.org/jira/browse/IGNITE-12774 > Project: Ignite > Issue Type: Bug > Reporter: Sergey Antonov > Assignee: Sergey Antonov > Priority: Major > Fix For: 2.9 > > Time Spent: 20m > Remaining Estimate: 0h > > The transaction hung after a “Too many open files” error and never finished. > {code:java} > import java.net.SocketException; > import java.util.concurrent.atomic.AtomicBoolean; > import org.apache.ignite.cluster.ClusterNode; > import org.apache.ignite.configuration.CacheConfiguration; > import org.apache.ignite.configuration.IgniteConfiguration; > import org.apache.ignite.failure.StopNodeOrHaltFailureHandler; > import org.apache.ignite.internal.IgniteEx; > import org.apache.ignite.lang.IgniteInClosure; > import org.apache.ignite.plugin.extensions.communication.Message; > import org.apache.ignite.spi.IgniteSpiException; > import org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi; > import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; > import org.apache.ignite.transactions.Transaction; > import org.apache.ignite.transactions.TransactionConcurrency; > import org.apache.ignite.transactions.TransactionIsolation; > import static org.apache.ignite.cache.CacheAtomicityMode.TRANSACTIONAL; > import static org.apache.ignite.cache.CacheMode.PARTITIONED; > public class TooManyOpenFilesTest extends GridCommonAbstractTest { > @Override protected IgniteConfiguration getConfiguration(String > igniteInstanceName) throws Exception { > return super.getConfiguration(igniteInstanceName) > .setFailureHandler(new StopNodeOrHaltFailureHandler()) > .setCommunicationSpi(new TooManyOpenFilesTcpCommunicationSpi()) > .setConsistentId(igniteInstanceName); > } > @Override 
protected void beforeTest() throws Exception { > super.beforeTest(); > stopAllGrids(); > cleanPersistenceDir(); > } > @Override protected void afterTest() throws Exception { > stopAllGrids(); > cleanPersistenceDir(); > super.afterTest(); > } > public void test() throws Exception { > IgniteEx crd = startGrids(3); > crd.cluster().active(true); > crd.getOrCreateCache(new > CacheConfiguration<>().setName(DEFAULT_CACHE_NAME).setAtomicityMode(TRANSACTIONAL).setBackups(1).setCacheMode(PARTITIONED)); > TooManyOpenFilesTcpCommunicationSpi spi = > (TooManyOpenFilesTcpCommunicationSpi)grid(2).context().config().getCommunicationSpi(); > try (Transaction tx = > grid(1).transactions().txStart(TransactionConcurrency.PESSIMISTIC, > TransactionIsolation.REPEATABLE_READ)) { > IgniteCache<Object, Object> cache = > grid(1).cache(DEFAULT_CACHE_NAME); > cache.put(1, 1); > spi.throwException.set(true); > cache.put(2, 2); > cache.put(3, 2); > cache.put(4, 2); > // hangs here. > tx.commit(); > } > for (int i=0; i < 3 ; i++) { > assertEquals(1, grid(i).cache(DEFAULT_CACHE_NAME).get(1)); > assertEquals(2, grid(i).cache(DEFAULT_CACHE_NAME).get(2)); > } > } > private static class TooManyOpenFilesTcpCommunicationSpi extends > TcpCommunicationSpi { > private final AtomicBoolean throwException = new AtomicBoolean(); > /** {@inheritDoc} */ > @Override public void sendMessage(ClusterNode node, Message msg) > throws IgniteSpiException { > if (throwException.get()) > throw getException(node); > super.sendMessage(node, msg); > } > /** {@inheritDoc} */ > @Override public void sendMessage( > ClusterNode node, > Message msg, > IgniteInClosure<IgniteException> ackC > ) throws IgniteSpiException { > if (throwException.get()) > throw getException(node); > super.sendMessage(node, msg, ackC); > } > private IgniteSpiException getException(ClusterNode node) { > String checkedExceptionMsg = "Failed to connect to node (is node > still alive?). 
" + > "Make sure that each ComputeTask and cache Transaction has a > timeout set " + > "in order to prevent parties from waiting forever in case of > network issues " + > "[nodeId=" + node.id() + ", addrs=null]"; > return new IgniteSpiException("Failed to send message to remote > node: " + node.id(), new IgniteCheckedException(checkedExceptionMsg, new > SocketException("Too many open files"))); > } > } > } > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)