[
https://issues.apache.org/jira/browse/SPARK-44547?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Frank Yin updated SPARK-44547:
------------------------------
Description:
Looks like the RDD cache doesn't support fallback storage and we should stop
the migration if the only viable peer is the fallback storage.
[^spark-error.log] 23/07/25 05:12:58 WARN BlockManager: Failed to replicate
rdd_18_25 to BlockManagerId(fallback, remote, 7337, None), failure #0
java.io.IOException: Failed to connect to remote:7337
at
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
at
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
at
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
at
org.apache.spark.network.netty.NettyBlockTransferService.uploadBlock(NettyBlockTransferService.scala:168)
at
org.apache.spark.network.BlockTransferService.uploadBlockSync(BlockTransferService.scala:121)
at
org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$replicate(BlockManager.scala:1784)
at
org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2(BlockManager.scala:1721)
at
org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2$adapted(BlockManager.scala:1707)
at scala.Option.forall(Option.scala:390)
at
org.apache.spark.storage.BlockManager.replicateBlock(BlockManager.scala:1707)
at
org.apache.spark.storage.BlockManagerDecommissioner.migrateBlock(BlockManagerDecommissioner.scala:356)
at
org.apache.spark.storage.BlockManagerDecommissioner.$anonfun$decommissionRddCacheBlocks$3(BlockManagerDecommissioner.scala:340)
at
scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at
scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at
scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at
org.apache.spark.storage.BlockManagerDecommissioner.decommissionRddCacheBlocks(BlockManagerDecommissioner.scala:339)
at
org.apache.spark.storage.BlockManagerDecommissioner$$anon$1.run(BlockManagerDecommissioner.scala:214)
at
java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown
Source)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown
Source)
at java.base/java.lang.Thread.run(Unknown Source)
Caused by: java.net.UnknownHostException: remote
at java.base/java.net.InetAddress$CachedAddresses.get(Unknown Source)
at java.base/java.net.InetAddress.getAllByName0(Unknown Source)
at java.base/java.net.InetAddress.getAllByName(Unknown Source)
at java.base/java.net.InetAddress.getAllByName(Unknown Source)
at java.base/java.net.InetAddress.getByName(Unknown Source)
at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:156)
at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:153)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at
io.netty.util.internal.SocketUtils.addressByName(SocketUtils.java:153)
at
io.netty.resolver.DefaultNameResolver.doResolve(DefaultNameResolver.java:41)
at
io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:61)
at
io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:53)
at
io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:55)
at
io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:31)
at
io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:106)
at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:206)
at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:46)
at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:180)
at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:166)
at
io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:578)
at
io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:552)
at
io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:491)
at
io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:616)
at
io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:605)
at
io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
at
io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
at
io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:990)
at
io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:516)
at
io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:429)
at
io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:486)
at
io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)
at
io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:503)
at
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
was:
Looks like the RDD cache doesn't support fallback storage and we should stop
the migration if the only viable peer is the fallback storage.
{{23/07/25 05:12:58 WARN BlockManager: Failed to replicate rdd_18_25 to
BlockManagerId(fallback, remote, 7337, None), failure #0
java.io.IOException: Failed to connect to remote:7337
at
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
at
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
at
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
at
org.apache.spark.network.netty.NettyBlockTransferService.uploadBlock(NettyBlockTransferService.scala:168)
at
org.apache.spark.network.BlockTransferService.uploadBlockSync(BlockTransferService.scala:121)
at
org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$replicate(BlockManager.scala:1784)
at
org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2(BlockManager.scala:1721)
at
org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2$adapted(BlockManager.scala:1707)
at scala.Option.forall(Option.scala:390)
at
org.apache.spark.storage.BlockManager.replicateBlock(BlockManager.scala:1707)
at
org.apache.spark.storage.BlockManagerDecommissioner.migrateBlock(BlockManagerDecommissioner.scala:356)
at
org.apache.spark.storage.BlockManagerDecommissioner.$anonfun$decommissionRddCacheBlocks$3(BlockManagerDecommissioner.scala:340)
at
scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at
scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at
scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at
org.apache.spark.storage.BlockManagerDecommissioner.decommissionRddCacheBlocks(BlockManagerDecommissioner.scala:339)
at
org.apache.spark.storage.BlockManagerDecommissioner$$anon$1.run(BlockManagerDecommissioner.scala:214)
at
java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown
Source)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown
Source)
at java.base/java.lang.Thread.run(Unknown Source)
Caused by: java.net.UnknownHostException: remote
at java.base/java.net.InetAddress$CachedAddresses.get(Unknown Source)
at java.base/java.net.InetAddress.getAllByName0(Unknown Source)
at java.base/java.net.InetAddress.getAllByName(Unknown Source)
at java.base/java.net.InetAddress.getAllByName(Unknown Source)
at java.base/java.net.InetAddress.getByName(Unknown Source)
at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:156)
at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:153)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at
io.netty.util.internal.SocketUtils.addressByName(SocketUtils.java:153)
at
io.netty.resolver.DefaultNameResolver.doResolve(DefaultNameResolver.java:41)
at
io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:61)
at
io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:53)
at
io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:55)
at
io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:31)
at
io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:106)
at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:206)
at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:46)
at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:180)
at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:166)
at
io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:578)
at
io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:552)
at
io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:491)
at
io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:616)
at
io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:605)
at
io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
at
io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
at
io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:990)
at
io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:516)
at
io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:429)
at
io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:486)
at
io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)
at
io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:503)
at
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)}}
> BlockManagerDecommissioner throws exceptions when migrating RDD cached blocks
> to fallback storage
> -------------------------------------------------------------------------------------------------
>
> Key: SPARK-44547
> URL: https://issues.apache.org/jira/browse/SPARK-44547
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 3.4.1
> Reporter: Frank Yin
> Priority: Major
> Attachments: spark-error.log
>
>
> Looks like the RDD cache doesn't support fallback storage and we should stop
> the migration if the only viable peer is the fallback storage.
> [^spark-error.log] 23/07/25 05:12:58 WARN BlockManager: Failed to replicate
> rdd_18_25 to BlockManagerId(fallback, remote, 7337, None), failure #0
> java.io.IOException: Failed to connect to remote:7337
> at
> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
> at
> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
> at
> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
> at
> org.apache.spark.network.netty.NettyBlockTransferService.uploadBlock(NettyBlockTransferService.scala:168)
> at
> org.apache.spark.network.BlockTransferService.uploadBlockSync(BlockTransferService.scala:121)
> at
> org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$replicate(BlockManager.scala:1784)
> at
> org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2(BlockManager.scala:1721)
> at
> org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2$adapted(BlockManager.scala:1707)
> at scala.Option.forall(Option.scala:390)
> at
> org.apache.spark.storage.BlockManager.replicateBlock(BlockManager.scala:1707)
> at
> org.apache.spark.storage.BlockManagerDecommissioner.migrateBlock(BlockManagerDecommissioner.scala:356)
> at
> org.apache.spark.storage.BlockManagerDecommissioner.$anonfun$decommissionRddCacheBlocks$3(BlockManagerDecommissioner.scala:340)
> at
> scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
> at
> scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
> at
> scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
> at scala.collection.TraversableLike.map(TraversableLike.scala:286)
> at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
> at scala.collection.AbstractTraversable.map(Traversable.scala:108)
> at
> org.apache.spark.storage.BlockManagerDecommissioner.decommissionRddCacheBlocks(BlockManagerDecommissioner.scala:339)
> at
> org.apache.spark.storage.BlockManagerDecommissioner$$anon$1.run(BlockManagerDecommissioner.scala:214)
> at
> java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
> at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
> at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown
> Source)
> at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown
> Source)
> at java.base/java.lang.Thread.run(Unknown Source)
> Caused by: java.net.UnknownHostException: remote
> at java.base/java.net.InetAddress$CachedAddresses.get(Unknown Source)
> at java.base/java.net.InetAddress.getAllByName0(Unknown Source)
> at java.base/java.net.InetAddress.getAllByName(Unknown Source)
> at java.base/java.net.InetAddress.getAllByName(Unknown Source)
> at java.base/java.net.InetAddress.getByName(Unknown Source)
> at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:156)
> at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:153)
> at java.base/java.security.AccessController.doPrivileged(Native Method)
> at
> io.netty.util.internal.SocketUtils.addressByName(SocketUtils.java:153)
> at
> io.netty.resolver.DefaultNameResolver.doResolve(DefaultNameResolver.java:41)
> at
> io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:61)
> at
> io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:53)
> at
> io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:55)
> at
> io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:31)
> at
> io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:106)
> at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:206)
> at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:46)
> at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:180)
> at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:166)
> at
> io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:578)
> at
> io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:552)
> at
> io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:491)
> at
> io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:616)
> at
> io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:605)
> at
> io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
> at
> io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
> at
> io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:990)
> at
> io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:516)
> at
> io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:429)
> at
> io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:486)
> at
> io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)
> at
> io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469)
> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:503)
> at
> io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
> at
> io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
> at
> io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]