[ https://issues.apache.org/jira/browse/IGNITE-14756?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Mirza Aliev updated IGNITE-14756: --------------------------------- Description: Sometimes we get following NPE {code:java} SEVERE: Failed to notify configuration listener. java.util.concurrent.CompletionException: java.lang.NullPointerException at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:314) at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1113) at java.base/java.util.concurrent.CompletableFuture.thenCompose(CompletableFuture.java:2235) at org.apache.ignite.internal.schema.SchemaManager.initSchemaForTable(SchemaManager.java:173) at org.apache.ignite.internal.table.distributed.TableManager.lambda$listenForTableChange$0(TableManager.java:221) at java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1072) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073) at reactor.core.publisher.MonoToCompletableFuture.onNext(MonoToCompletableFuture.java:63) at reactor.core.publisher.SerializedSubscriber.onNext(SerializedSubscriber.java:99) at reactor.core.publisher.FluxTimeout$TimeoutMainSubscriber.onNext(FluxTimeout.java:174) at reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:76) at reactor.core.publisher.FluxFilterFuseable$FilterFuseableSubscriber.onNext(FluxFilterFuseable.java:112) at reactor.core.publisher.FluxFilterFuseable$FilterFuseableConditionalSubscriber.onNext(FluxFilterFuseable.java:330) at reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.drainRegular(FluxOnBackpressureBuffer.java:261) at reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.drain(FluxOnBackpressureBuffer.java:225) at reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.onNext(FluxOnBackpressureBuffer.java:184) at 
reactor.core.publisher.DirectProcessor$DirectInner.onNext(DirectProcessor.java:333) at reactor.core.publisher.DirectProcessor.onNext(DirectProcessor.java:142) at reactor.core.publisher.FluxCreate$IgnoreSink.next(FluxCreate.java:618) at reactor.core.publisher.FluxCreate$SerializedSink.next(FluxCreate.java:153) at org.apache.ignite.network.scalecube.ScaleCubeDirectMarshallerTransport.onMessage(ScaleCubeDirectMarshallerTransport.java:166) at org.apache.ignite.network.internal.netty.ConnectionManager.lambda$onMessage$2(ConnectionManager.java:140) at java.base/java.util.concurrent.CopyOnWriteArrayList.forEach(CopyOnWriteArrayList.java:803) at org.apache.ignite.network.internal.netty.ConnectionManager.onMessage(ConnectionManager.java:140) at org.apache.ignite.network.internal.netty.MessageHandler.channelRead(MessageHandler.java:46) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:324) at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:296) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) at 
io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:719) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:655) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:581) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.base/java.lang.Thread.run(Thread.java:834) Caused by: java.lang.NullPointerException at org.apache.ignite.internal.schema.SchemaManager.createSchemaDescriptor(SchemaManager.java:234) at org.apache.ignite.internal.schema.SchemaManager.lambda$initSchemaForTable$1(SchemaManager.java:184) at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1106) ... 
45 more {code} Please check: - that we notify ConfigurationManager before SchemaManager; - that configuration changes are made from within the notification thread. UPD: The root cause is a data race in {{org.apache.ignite.configuration.internal.ConfigurationNode#refreshValue}}. Previously, the method contained this code: {code:java} synchronized (this) { if (cachedRootNode == oldRootNode) { cachedRootNode = newRootNode; // 1 beforeRefreshValue(newVal); // 2 return val = newVal; } else { if (invalid) throw noSuchElementException(); return val; } } {code} Between steps 1 and 2, it was possible to read the old {{NamedListView<VIEW>}} for {{newRootNode}}. The fix is to cache the new RootNode only after we update {{NamedListView<VIEW>}}: {code:java} synchronized (this) { if (cachedRootNode == oldRootNode) { beforeRefreshValue(newVal); val = newVal; cachedRootNode = newRootNode; return newVal; } else { if (invalid) throw noSuchElementException(); return val; } } {code} was: Sometimes we get the following NPE {code:java} SEVERE: Failed to notify configuration listener. 
java.util.concurrent.CompletionException: java.lang.NullPointerException at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:314) at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1113) at java.base/java.util.concurrent.CompletableFuture.thenCompose(CompletableFuture.java:2235) at org.apache.ignite.internal.schema.SchemaManager.initSchemaForTable(SchemaManager.java:173) at org.apache.ignite.internal.table.distributed.TableManager.lambda$listenForTableChange$0(TableManager.java:221) at java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1072) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073) at reactor.core.publisher.MonoToCompletableFuture.onNext(MonoToCompletableFuture.java:63) at reactor.core.publisher.SerializedSubscriber.onNext(SerializedSubscriber.java:99) at reactor.core.publisher.FluxTimeout$TimeoutMainSubscriber.onNext(FluxTimeout.java:174) at reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:76) at reactor.core.publisher.FluxFilterFuseable$FilterFuseableSubscriber.onNext(FluxFilterFuseable.java:112) at reactor.core.publisher.FluxFilterFuseable$FilterFuseableConditionalSubscriber.onNext(FluxFilterFuseable.java:330) at reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.drainRegular(FluxOnBackpressureBuffer.java:261) at reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.drain(FluxOnBackpressureBuffer.java:225) at reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.onNext(FluxOnBackpressureBuffer.java:184) at reactor.core.publisher.DirectProcessor$DirectInner.onNext(DirectProcessor.java:333) at reactor.core.publisher.DirectProcessor.onNext(DirectProcessor.java:142) at 
reactor.core.publisher.FluxCreate$IgnoreSink.next(FluxCreate.java:618) at reactor.core.publisher.FluxCreate$SerializedSink.next(FluxCreate.java:153) at org.apache.ignite.network.scalecube.ScaleCubeDirectMarshallerTransport.onMessage(ScaleCubeDirectMarshallerTransport.java:166) at org.apache.ignite.network.internal.netty.ConnectionManager.lambda$onMessage$2(ConnectionManager.java:140) at java.base/java.util.concurrent.CopyOnWriteArrayList.forEach(CopyOnWriteArrayList.java:803) at org.apache.ignite.network.internal.netty.ConnectionManager.onMessage(ConnectionManager.java:140) at org.apache.ignite.network.internal.netty.MessageHandler.channelRead(MessageHandler.java:46) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:324) at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:296) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) at 
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:719) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:655) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:581) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.base/java.lang.Thread.run(Thread.java:834) Caused by: java.lang.NullPointerException at org.apache.ignite.internal.schema.SchemaManager.createSchemaDescriptor(SchemaManager.java:234) at org.apache.ignite.internal.schema.SchemaManager.lambda$initSchemaForTable$1(SchemaManager.java:184) at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1106) ... 
45 more {code} Please check: - that we notify ConfigurationManager before SchemaManager; - that configuration changes are made from within the notification thread. UPD: The root cause is a data race in {{org.apache.ignite.configuration.internal.ConfigurationNode#refreshValue}}. Previously, the method contained this code: {code:java} synchronized (this) { if (cachedRootNode == oldRootNode) { cachedRootNode = newRootNode; // 1 beforeRefreshValue(newVal); // 2 return val = newVal; } else { if (invalid) throw noSuchElementException(); return val; } } {code} Between steps 1 and 2, it was possible to read the old {{NamedListView<VIEW>}} for {{newRootNode}}. The fix is to cache the new RootNode only after we update {{NamedListView<VIEW>}}: {code:java} synchronized (this) { if (cachedRootNode == oldRootNode) { beforeRefreshValue(newVal); val = newVal; cachedRootNode = newRootNode; return newVal; } else { if (invalid) throw noSuchElementException(); return val; } } {code} > NPE on reading tableConfiguration from ConfigurationManager within > SchemaManager > -------------------------------------------------------------------------------- > > Key: IGNITE-14756 > URL: https://issues.apache.org/jira/browse/IGNITE-14756 > Project: Ignite > Issue Type: Bug > Reporter: Alexander Lapin > Assignee: Mirza Aliev > Priority: Blocker > Labels: ignite-3 > Fix For: 3.0.0-alpha2 > > > Sometimes we get the following NPE > > {code:java} > SEVERE: Failed to notify configuration listener. 
> java.util.concurrent.CompletionException: java.lang.NullPointerException > at > java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:314) > at > java.base/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1113) > at > java.base/java.util.concurrent.CompletableFuture.thenCompose(CompletableFuture.java:2235) > at > org.apache.ignite.internal.schema.SchemaManager.initSchemaForTable(SchemaManager.java:173) > at > org.apache.ignite.internal.table.distributed.TableManager.lambda$listenForTableChange$0(TableManager.java:221) > at > java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1072) > at > java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) > at > java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073) > at > reactor.core.publisher.MonoToCompletableFuture.onNext(MonoToCompletableFuture.java:63) > at > reactor.core.publisher.SerializedSubscriber.onNext(SerializedSubscriber.java:99) > at > reactor.core.publisher.FluxTimeout$TimeoutMainSubscriber.onNext(FluxTimeout.java:174) > at > reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:76) > at > reactor.core.publisher.FluxFilterFuseable$FilterFuseableSubscriber.onNext(FluxFilterFuseable.java:112) > at > reactor.core.publisher.FluxFilterFuseable$FilterFuseableConditionalSubscriber.onNext(FluxFilterFuseable.java:330) > at > reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.drainRegular(FluxOnBackpressureBuffer.java:261) > at > reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.drain(FluxOnBackpressureBuffer.java:225) > at > reactor.core.publisher.FluxOnBackpressureBuffer$BackpressureBufferSubscriber.onNext(FluxOnBackpressureBuffer.java:184) > at > reactor.core.publisher.DirectProcessor$DirectInner.onNext(DirectProcessor.java:333) > at > 
reactor.core.publisher.DirectProcessor.onNext(DirectProcessor.java:142) > at > reactor.core.publisher.FluxCreate$IgnoreSink.next(FluxCreate.java:618) > at > reactor.core.publisher.FluxCreate$SerializedSink.next(FluxCreate.java:153) > at > org.apache.ignite.network.scalecube.ScaleCubeDirectMarshallerTransport.onMessage(ScaleCubeDirectMarshallerTransport.java:166) > at > org.apache.ignite.network.internal.netty.ConnectionManager.lambda$onMessage$2(ConnectionManager.java:140) > at > java.base/java.util.concurrent.CopyOnWriteArrayList.forEach(CopyOnWriteArrayList.java:803) > at > org.apache.ignite.network.internal.netty.ConnectionManager.onMessage(ConnectionManager.java:140) > at > org.apache.ignite.network.internal.netty.MessageHandler.channelRead(MessageHandler.java:46) > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) > at > io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) > at > io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:324) > at > io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:296) > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) > at > io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) > at > io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) > at > 
io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) > at > io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) > at > io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:719) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:655) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:581) > at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) > at > io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) > at > io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at > io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > at java.base/java.lang.Thread.run(Thread.java:834) > Caused by: java.lang.NullPointerException > at > org.apache.ignite.internal.schema.SchemaManager.createSchemaDescriptor(SchemaManager.java:234) > at > org.apache.ignite.internal.schema.SchemaManager.lambda$initSchemaForTable$1(SchemaManager.java:184) > at > java.base/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1106) > ... 
45 more > {code} > Please check: > - that we notify ConfigurationManager before SchemaManager; > - that configuration changes are made from within the notification thread. > UPD: > The root cause is a data race in > {{org.apache.ignite.configuration.internal.ConfigurationNode#refreshValue}}. > Previously, the method contained this code: > {code:java} > synchronized (this) { > if (cachedRootNode == oldRootNode) { > cachedRootNode = newRootNode; // 1 > beforeRefreshValue(newVal); // 2 > return val = newVal; > } > else { > if (invalid) > throw noSuchElementException(); > return val; > } > } > {code} > Between steps 1 and 2, it was possible to read the old > {{NamedListView<VIEW>}} for {{newRootNode}}. > The fix is to cache the new RootNode only after we update {{NamedListView<VIEW>}}: > {code:java} > synchronized (this) { > if (cachedRootNode == oldRootNode) { > beforeRefreshValue(newVal); > val = newVal; > cachedRootNode = newRootNode; > return newVal; > } > else { > if (invalid) > throw noSuchElementException(); > return val; > } > } > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)