[
https://issues.apache.org/jira/browse/IGNITE-22319?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Nikita Amelchev updated IGNITE-22319:
-------------------------------------
Release Note: Fixed a node crash that occurred when a snapshot restore was
cancelled due to network issues
> Node crashes if a snapshot restore cancelled due to network issues
> -------------------------------------------------------------------
>
> Key: IGNITE-22319
> URL: https://issues.apache.org/jira/browse/IGNITE-22319
> Project: Ignite
> Issue Type: Bug
> Affects Versions: 2.15, 2.16
> Reporter: Nikita Amelchev
> Assignee: Nikita Amelchev
> Priority: Major
> Labels: ise
> Fix For: 2.17
>
> Time Spent: 8h 20m
> Remaining Estimate: 0h
>
> Node crashes if a snapshot restore (not on the same topology) is cancelled
> due to network issues.
> There are several possible reasons:
> 1. Assertion error (node left or socket timeout):
> {noformat}
> [2024-05-21T18:39:22,212][ERROR][disco-event-worker-#1479%snapshot.IgniteSnapshotRestoreFromRemoteTest2%][GridEventStorageManager]
> Unexpected exception in listener notification for event: DiscoveryEvent
> [evtNode=TcpDiscoveryNode [id=1b573ddc-b0eb-4909-978b-9d418c100000,
> consistentId=snapshot.IgniteSnapshotRestoreFromRemoteTest0, addrs=ArrayList
> [127.0.0.1], sockAddrs=HashSet [/127.0.0.1:47500], discPort=47500, order=1,
> intOrder=1, loc=false, ver=2.17.0#20240512-sha1:27cef45b, isClient=false],
> topVer=4, msgTemplate=null,
> span=o.a.i.i.processors.tracing.NoopSpan@48b53ef5, nodeId8=9992b67a, msg=Node
> left, type=NODE_LEFT, tstamp=1716305962203]
> java.lang.AssertionError: null
> at
> org.apache.ignite.internal.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager.onException(IgniteSnapshotManager.java:4022)
> ~[classes/:?]
> at
> org.apache.ignite.internal.managers.communication.GridIoManager.interruptReceiver(GridIoManager.java:2799)
> ~[classes/:?]
> at
> org.apache.ignite.internal.managers.communication.GridIoManager$6.onEvent(GridIoManager.java:972)
> ~[classes/:?]
> at
> org.apache.ignite.internal.managers.eventstorage.GridEventStorageManager$LocalListenerWrapper.onEvent(GridEventStorageManager.java:1403)
> ~[classes/:?]
> at
> org.apache.ignite.internal.managers.eventstorage.GridEventStorageManager.notifyListeners(GridEventStorageManager.java:898)
> [classes/:?]
> at
> org.apache.ignite.internal.managers.eventstorage.GridEventStorageManager.notifyListeners(GridEventStorageManager.java:883)
> [classes/:?]
> at
> org.apache.ignite.internal.managers.eventstorage.GridEventStorageManager.record0(GridEventStorageManager.java:354)
> [classes/:?]
> at
> org.apache.ignite.internal.managers.eventstorage.GridEventStorageManager.record(GridEventStorageManager.java:317)
> [classes/:?]
> at
> org.apache.ignite.internal.managers.discovery.GridDiscoveryManager$DiscoveryWorker.recordEvent(GridDiscoveryManager.java:3036)
> [classes/:?]
> at
> org.apache.ignite.internal.managers.discovery.GridDiscoveryManager$DiscoveryWorker.body0(GridDiscoveryManager.java:3223)
> [classes/:?]
> at
> org.apache.ignite.internal.managers.discovery.GridDiscoveryManager$DiscoveryWorker.body(GridDiscoveryManager.java:3056)
> [classes/:?]
> at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:125)
> [classes/:?]
> at java.base/java.lang.Thread.run(Thread.java:829) [?:?]
> {noformat}
> 2. Deadlock of system threads.
> {noformat}
> ##### DEADLOCKED Thread
> [name="pub-#956%snapshot.IgniteSnapshotRestoreFromRemoteTest2%", id=1056,
> state=BLOCKED, blockCnt=10, waitCnt=0]
> Lock
> [object=o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier@4a114695,
>
> ownerName=disco-event-worker-#853%snapshot.IgniteSnapshotRestoreFromRemoteTest2%,
> ownerId=952]
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.init(IgniteSnapshotManager.java:3698)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager.submit(IgniteSnapshotManager.java:3836)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager@1da11e24
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager.scheduleNext(IgniteSnapshotManager.java:3849)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager@1da11e24
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager$$Lambda$1787/0x00000008009c1c40.run(Unknown
> Source)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.lambda$listen$8a14a590$1(GridFutureAdapter.java:363)
> at
> app//o.a.i.i.util.future.GridFutureAdapter$$Lambda$1215/0x00000008007ba040.apply(Unknown
> Source)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.notifyListener(GridFutureAdapter.java:474)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.unblock(GridFutureAdapter.java:350)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.unblockAll(GridFutureAdapter.java:338)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:586)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.onDone(IgniteSnapshotManager.java:3773)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier@21a97efb
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.onDone(IgniteSnapshotManager.java:3642)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:565)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:553)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.acceptException(IgniteSnapshotManager.java:3740)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier@21a97efb
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.acceptFile(IgniteSnapshotManager.java:3754)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier@21a97efb
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager$1.accept(IgniteSnapshotManager.java:4111)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager$1.accept(IgniteSnapshotManager.java:4098)
> at
> app//o.a.i.i.managers.communication.FileReceiver.receive(FileReceiver.java:95)
> at
> app//o.a.i.i.managers.communication.GridIoManager.receiveFromChannel(GridIoManager.java:2948)
> at
> app//o.a.i.i.managers.communication.GridIoManager.processOpenedChannel(GridIoManager.java:2877)
> at
> app//o.a.i.i.managers.communication.GridIoManager$7.run(GridIoManager.java:1231)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
> at java.base/java.lang.Thread.run(Thread.java:829)
> ##### DEADLOCKED Thread
> [name="disco-event-worker-#853%snapshot.IgniteSnapshotRestoreFromRemoteTest2%",
> id=952, state=BLOCKED, blockCnt=1, waitCnt=74]
> Lock
> [object=o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager@1da11e24,
> ownerName=pub-#956%snapshot.IgniteSnapshotRestoreFromRemoteTest2%,
> ownerId=1056]
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager.scheduleNext(IgniteSnapshotManager.java:3844)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager$$Lambda$1787/0x00000008009c1c40.run(Unknown
> Source)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.lambda$listen$8a14a590$1(GridFutureAdapter.java:363)
> at
> app//o.a.i.i.util.future.GridFutureAdapter$$Lambda$1215/0x00000008007ba040.apply(Unknown
> Source)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.notifyListener(GridFutureAdapter.java:474)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.unblock(GridFutureAdapter.java:350)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.unblockAll(GridFutureAdapter.java:338)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:586)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.onDone(IgniteSnapshotManager.java:3773)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier@4a114695
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.onDone(IgniteSnapshotManager.java:3642)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:565)
> at
> app//o.a.i.i.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:553)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier.acceptException(IgniteSnapshotManager.java:3740)
> - locked
> o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$RemoteSnapshotFilesRecevier@4a114695
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager.lambda$onNodeLeft$0(IgniteSnapshotManager.java:3888)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager$$Lambda$1658/0x0000000800926840.accept(Unknown
> Source)
> at java.base/java.lang.Iterable.forEach(Iterable.java:75)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$SequentialRemoteSnapshotManager.onNodeLeft(IgniteSnapshotManager.java:3886)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager.lambda$start0$20(IgniteSnapshotManager.java:668)
> at
> app//o.a.i.i.processors.cache.persistence.snapshot.IgniteSnapshotManager$$Lambda$547/0x00000008004edc40.onEvent(Unknown
> Source)
> at
> app//o.a.i.i.managers.eventstorage.GridEventStorageManager$DiscoveryListenerWrapper.onEvent(GridEventStorageManager.java:1453)
> at
> app//o.a.i.i.managers.eventstorage.GridEventStorageManager.notifyListeners(GridEventStorageManager.java:898)
> at
> app//o.a.i.i.managers.eventstorage.GridEventStorageManager.notifyListeners(GridEventStorageManager.java:883)
> at
> app//o.a.i.i.managers.eventstorage.GridEventStorageManager.record0(GridEventStorageManager.java:354)
> at
> app//o.a.i.i.managers.eventstorage.GridEventStorageManager.record(GridEventStorageManager.java:317)
> at
> app//o.a.i.i.managers.discovery.GridDiscoveryManager$DiscoveryWorker.recordEvent(GridDiscoveryManager.java:3036)
> at
> app//o.a.i.i.managers.discovery.GridDiscoveryManager$DiscoveryWorker.body0(GridDiscoveryManager.java:3223)
> at
> app//o.a.i.i.managers.discovery.GridDiscoveryManager$DiscoveryWorker.body(GridDiscoveryManager.java:3056)
> at app//o.a.i.i.util.worker.GridWorker.run(GridWorker.java:125)
> at java.base/java.lang.Thread.run(Thread.java:829)
> {noformat}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)