Github user aarondav commented on a diff in the pull request:
https://github.com/apache/spark/pull/5392#discussion_r33515404
--- Diff:
core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala ---
@@ -40,98 +37,139 @@ import org.apache.spark.util.{ActorLogReceive,
RpcUtils, Utils, AkkaUtils}
* @param masterUrls Each url should look like spark://host:port.
*/
private[spark] class AppClient(
- actorSystem: ActorSystem,
+ rpcEnv: RpcEnv,
masterUrls: Array[String],
appDescription: ApplicationDescription,
listener: AppClientListener,
conf: SparkConf)
extends Logging {
- private val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl(_,
AkkaUtils.protocol(actorSystem)))
+ private val masterRpcAddresses =
masterUrls.map(RpcAddress.fromSparkURL(_))
- private val REGISTRATION_TIMEOUT = 20.seconds
+ private val REGISTRATION_TIMEOUT_SECONDS = 20
private val REGISTRATION_RETRIES = 3
- private var masterAddress: Address = null
- private var actor: ActorRef = null
+ private var endpoint: RpcEndpointRef = null
private var appId: String = null
- private var registered = false
- private var activeMasterUrl: String = null
-
- private class ClientActor extends Actor with ActorLogReceive with
Logging {
- var master: ActorSelection = null
- var alreadyDisconnected = false // To avoid calling
listener.disconnected() multiple times
- var alreadyDead = false // To avoid calling listener.dead() multiple
times
- var registrationRetryTimer: Option[Cancellable] = None
-
- override def preStart() {
- context.system.eventStream.subscribe(self,
classOf[RemotingLifecycleEvent])
+ @volatile private var registered = false
+
+ private class ClientEndpoint(override val rpcEnv: RpcEnv) extends
ThreadSafeRpcEndpoint
+ with Logging {
+
+ private var master: Option[RpcEndpointRef] = None
+ // To avoid calling listener.disconnected() multiple times
+ private var alreadyDisconnected = false
+ @volatile private var alreadyDead = false // To avoid calling
listener.dead() multiple times
+ @volatile private var registerMasterFutures: Array[JFuture[_]] = null
+ @volatile private var registrationRetryTimer: JScheduledFuture[_] =
null
+
+ // A thread pool for registering with masters. Because registering
with a master is a blocking
+ // action, this thread pool must be able to create
"masterRpcAddresses.size" threads at the same
+ // time so that we can register with all masters.
+ private val registerMasterThreadPool = new ThreadPoolExecutor(
+ 0,
+ masterRpcAddresses.size, // Make sure we can register with all
masters at the same time
+ 60L, TimeUnit.SECONDS,
+ new SynchronousQueue[Runnable](),
+
ThreadUtils.namedThreadFactory("appclient-register-master-threadpool"))
+
+ // A scheduled executor for scheduling the registration actions
+ private val registrationRetryThread =
+
ThreadUtils.newDaemonSingleThreadScheduledExecutor("appclient-registration-retry-thread")
+
+ override def onStart(): Unit = {
try {
- registerWithMaster()
+ registerWithMaster(1)
} catch {
case e: Exception =>
logWarning("Failed to connect to master", e)
markDisconnected()
- context.stop(self)
+ stop()
}
}
- def tryRegisterAllMasters() {
- for (masterAkkaUrl <- masterAkkaUrls) {
- logInfo("Connecting to master " + masterAkkaUrl + "...")
- val actor = context.actorSelection(masterAkkaUrl)
- actor ! RegisterApplication(appDescription)
+ /**
+ * Register with all masters asynchronously and return an array of
`Future`s for cancellation.
+ */
+ private def tryRegisterAllMasters(): Array[JFuture[_]] = {
+ for (masterAddress <- masterRpcAddresses) yield {
+ registerMasterThreadPool.submit(new Runnable {
+ override def run(): Unit = try {
+ if (registered) {
+ return
+ }
+ logInfo("Connecting to master " + masterAddress.toSparkURL +
"...")
+ val masterRef =
+ rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress,
Master.ENDPOINT_NAME)
+ masterRef.send(RegisterApplication(appDescription, self))
+ } catch {
+ case ie: InterruptedException => // Cancelled
+ case NonFatal(e) => logError(e.getMessage, e)
+ }
+ })
}
}
- def registerWithMaster() {
- tryRegisterAllMasters()
- import context.dispatcher
- var retries = 0
- registrationRetryTimer = Some {
- context.system.scheduler.schedule(REGISTRATION_TIMEOUT,
REGISTRATION_TIMEOUT) {
+ /**
+ * Register with all masters asynchronously. It will call
`registerWithMaster` every
+ * REGISTRATION_TIMEOUT_SECONDS seconds until the retry count exceeds
REGISTRATION_RETRIES.
+ * Once we connect to a master successfully, all scheduling work and
Futures will be cancelled.
+ *
+ * nthRetry means this is the nth attempt to register with the master.
+ */
+ private def registerWithMaster(nthRetry: Int) {
+ registerMasterFutures = tryRegisterAllMasters()
+ registrationRetryTimer =
registrationRetryThread.scheduleAtFixedRate(new Runnable {
+ override def run(): Unit = {
Utils.tryOrExit {
- retries += 1
if (registered) {
- registrationRetryTimer.foreach(_.cancel())
- } else if (retries >= REGISTRATION_RETRIES) {
+ registerMasterFutures.foreach(_.cancel(true))
+ registerMasterThreadPool.shutdownNow()
+ } else if (nthRetry >= REGISTRATION_RETRIES) {
markDead("All masters are unresponsive! Giving up.")
} else {
- tryRegisterAllMasters()
+ registerMasterFutures.foreach(_.cancel(true))
+ registerWithMaster(nthRetry + 1)
}
}
}
- }
+ }, REGISTRATION_TIMEOUT_SECONDS, REGISTRATION_TIMEOUT_SECONDS,
TimeUnit.SECONDS)
}
- def changeMaster(url: String) {
- // activeMasterUrl is a valid Spark url since we receive it from
master.
- activeMasterUrl = url
- master = context.actorSelection(
- Master.toAkkaUrl(activeMasterUrl, AkkaUtils.protocol(actorSystem)))
- masterAddress = Master.toAkkaAddress(activeMasterUrl,
AkkaUtils.protocol(actorSystem))
+ private def sendToMaster(message: Any): Unit = {
--- End diff --
Maybe add doc to this method to describe the no-master case.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]