This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 23e3626  [SPARK-35002][YARN][TESTS][FOLLOW-UP] Fix 
java.net.BindException in MiniYARNCluster
23e3626 is described below

commit 23e36266213edf736c6eb049e153dfe2e11728fb
Author: HyukjinKwon <gurwls...@apache.org>
AuthorDate: Wed Apr 14 17:13:48 2021 +0800

    [SPARK-35002][YARN][TESTS][FOLLOW-UP] Fix java.net.BindException in 
MiniYARNCluster
    
    This PR fixes two tests below:
    
    https://github.com/apache/spark/runs/2320161984
    
    ```
    [info] YarnShuffleIntegrationSuite:
    [info] org.apache.spark.deploy.yarn.YarnShuffleIntegrationSuite *** ABORTED 
*** (228 milliseconds)
    [info]   org.apache.hadoop.yarn.exceptions.YarnRuntimeException: 
org.apache.hadoop.yarn.webapp.WebAppException: Error starting http server
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster.startResourceManager(MiniYARNCluster.java:373)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster.access$300(MiniYARNCluster.java:128)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:503)
    [info]   at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster.serviceStart(MiniYARNCluster.java:322)
    [info]   at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at 
org.apache.spark.deploy.yarn.BaseYarnClusterSuite.beforeAll(BaseYarnClusterSuite.scala:95)
    ...
    [info]   Cause: java.net.BindException: Port in use: fv-az186-831:0
    [info]   at 
org.apache.hadoop.http.HttpServer2.constructBindException(HttpServer2.java:1231)
    [info]   at 
org.apache.hadoop.http.HttpServer2.bindForSinglePort(HttpServer2.java:1253)
    [info]   at 
org.apache.hadoop.http.HttpServer2.openListeners(HttpServer2.java:1316)
    [info]   at org.apache.hadoop.http.HttpServer2.start(HttpServer2.java:1167)
    [info]   at 
org.apache.hadoop.yarn.webapp.WebApps$Builder.start(WebApps.java:449)
    [info]   at 
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startWepApp(ResourceManager.java:1247)
    [info]   at 
org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:1356)
    [info]   at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster.startResourceManager(MiniYARNCluster.java:365)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster.access$300(MiniYARNCluster.java:128)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:503)
    [info]   at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at 
org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
    [info]   at 
org.apache.hadoop.yarn.server.MiniYARNCluster.serviceStart(MiniYARNCluster.java:322)
    [info]   at 
org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at 
org.apache.spark.deploy.yarn.BaseYarnClusterSuite.beforeAll(BaseYarnClusterSuite.scala:95)
    [info]   at 
org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:212)
    [info]   at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
    [info]   at 
org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
    [info]   at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:61)
    ...
    ```
    
    https://github.com/apache/spark/runs/2323342094
    
    ```
    [info] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret 
started
    [error] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret 
failed: java.lang.AssertionError: Connecting to /10.1.0.161:39895 timed out 
(120000 ms), took 120.081 sec
    [error]     at 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret(ExternalShuffleSecuritySuite.java:85)
    [error]     ...
    [info] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId 
started
    [error] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId 
failed: java.lang.AssertionError: Connecting to /10.1.0.198:44633 timed out 
(120000 ms), took 120.08 sec
    [error]     at 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId(ExternalShuffleSecuritySuite.java:76)
    [error]     ...
    [info] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid started
    [error] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid failed: 
java.io.IOException: Connecting to /10.1.0.119:43575 timed out (120000 ms), 
took 120.089 sec
    [error]     at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:285)
    [error]     at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
    [error]     at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
    [error]     at 
org.apache.spark.network.shuffle.ExternalBlockStoreClient.registerWithShuffleServer(ExternalBlockStoreClient.java:211)
    [error]     at 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.validate(ExternalShuffleSecuritySuite.java:108)
    [error]     at 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid(ExternalShuffleSecuritySuite.java:68)
    [error]     ...
    [info] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption 
started
    [error] Test 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption 
failed: java.io.IOException: Connecting to /10.1.0.248:35271 timed out (120000 
ms), took 120.014 sec
    [error]     at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:285)
    [error]     at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
    [error]     at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
    [error]     at 
org.apache.spark.network.shuffle.ExternalBlockStoreClient.registerWithShuffleServer(ExternalBlockStoreClient.java:211)
    [error]     at 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.validate(ExternalShuffleSecuritySuite.java:108)
    [error]     at 
org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption(ExternalShu
    ```
    
    For Yarn cluster suites, it's difficult to fix. This PR makes it skipped if 
it fails to bind.
    For shuffle-related suites, it uses localhost.
    
    To make the tests stable
    
    No, dev-only.
    
    It's tested in GitHub Actions: 
https://github.com/HyukjinKwon/spark/runs/2340210765
    
    Closes #32126 from HyukjinKwon/SPARK-35002-followup.
    
    Authored-by: HyukjinKwon <gurwls...@apache.org>
    Signed-off-by: Yuming Wang <yumw...@ebay.com>
    (cherry picked from commit a153efa643dcb1d8e6c2242846b3db0b2be39ae7)
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
---
 .../java/org/apache/spark/network/TestUtils.java   |  4 +++-
 .../spark/deploy/yarn/BaseYarnClusterSuite.scala   | 27 ++++++++++++++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git 
a/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java 
b/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
index 56a2b80..c2c5ffa 100644
--- 
a/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
+++ 
b/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
@@ -22,7 +22,9 @@ import java.net.InetAddress;
 public class TestUtils {
   public static String getLocalHost() {
     try {
-      return InetAddress.getLocalHost().getHostAddress();
+      return (System.getenv().containsKey("SPARK_LOCAL_IP"))?
+        System.getenv("SPARK_LOCAL_IP"):
+          InetAddress.getLocalHost().getHostAddress();
     } catch (Exception e) {
       throw new RuntimeException(e);
     }
diff --git 
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
 
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
index 20f5339..2542b45 100644
--- 
a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
+++ 
b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
@@ -28,7 +28,8 @@ import scala.concurrent.duration._
 import com.google.common.io.Files
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.server.MiniYARNCluster
-import org.scalatest.BeforeAndAfterAll
+import org.scalactic.source.Position
+import org.scalatest.{BeforeAndAfterAll, Tag}
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.matchers.must.Matchers
 
@@ -41,6 +42,7 @@ import org.apache.spark.util.Utils
 
 abstract class BaseYarnClusterSuite
   extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging {
+  private var isBindSuccessful = true
 
   // log4j configuration for the YARN containers, so that their output is 
collected
   // by YARN instead of trying to overwrite unit-tests.log.
@@ -64,6 +66,14 @@ abstract class BaseYarnClusterSuite
 
   def newYarnConfig(): YarnConfiguration
 
+  override protected def test(testName: String, testTags: Tag*)(testFun: => 
Any)
+                             (implicit pos: Position): Unit = {
+    super.test(testName, testTags: _*) {
+      assume(isBindSuccessful, "Mini Yarn cluster should be able to bind.")
+      testFun
+    }
+  }
+
   override def beforeAll(): Unit = {
     super.beforeAll()
 
@@ -80,9 +90,16 @@ abstract class BaseYarnClusterSuite
     
yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage",
       "100.0")
 
-    yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
-    yarnCluster.init(yarnConf)
-    yarnCluster.start()
+    try {
+      yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
+      yarnCluster.init(yarnConf)
+      yarnCluster.start()
+    } catch {
+      case e: Throwable if 
org.apache.commons.lang3.exception.ExceptionUtils.indexOfThrowable(
+          e, classOf[java.net.BindException]) != -1 =>
+        isBindSuccessful = false
+        return
+    }
 
     // There's a race in MiniYARNCluster in which start() may return before 
the RM has updated
     // its address in the configuration. You can see this in the logs by 
noticing that when
@@ -118,7 +135,7 @@ abstract class BaseYarnClusterSuite
 
   override def afterAll(): Unit = {
     try {
-      yarnCluster.stop()
+      if (yarnCluster != null) yarnCluster.stop()
     } finally {
       super.afterAll()
     }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to