Github user zsxwing commented on a diff in the pull request:
https://github.com/apache/spark/pull/5392#discussion_r27969371
--- Diff:
core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala ---
@@ -40,98 +37,127 @@ import org.apache.spark.util.{ActorLogReceive, Utils,
AkkaUtils}
* @param masterUrls Each url should look like spark://host:port.
*/
private[spark] class AppClient(
- actorSystem: ActorSystem,
+ rpcEnv: RpcEnv,
masterUrls: Array[String],
appDescription: ApplicationDescription,
listener: AppClientListener,
conf: SparkConf)
extends Logging {
- private val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl(_,
AkkaUtils.protocol(actorSystem)))
+ private val masterRpcAddresses =
masterUrls.map(RpcAddress.fromSparkURL(_))
- private val REGISTRATION_TIMEOUT = 20.seconds
+ private val REGISTRATION_TIMEOUT_SECONDS = 20
private val REGISTRATION_RETRIES = 3
- private var masterAddress: Address = null
- private var actor: ActorRef = null
+ private var endpoint: RpcEndpointRef = null
private var appId: String = null
- private var registered = false
- private var activeMasterUrl: String = null
+ @volatile private var registered = false
+
+ private class ClientEndpoint(override val rpcEnv: RpcEnv) extends
ThreadSafeRpcEndpoint
+ with Logging {
+
+ var master: Option[RpcEndpointRef] = None
+ var alreadyDisconnected = false // To avoid calling
listener.disconnected() multiple times
+ @volatile private var alreadyDead = false // To avoid calling
listener.dead() multiple times
+ @volatile private var registerMasterFutures: Array[Future[_]] = null
+ @volatile private var registrationRetryTimer: ScheduledFuture[_] = null
+
+ private val registerMasterThreadPool = new ThreadPoolExecutor(
+ 0,
+ masterRpcAddresses.size, // Make sure we can register with all
masters at the same time
+ 60L, TimeUnit.SECONDS,
+ new SynchronousQueue[Runnable](),
+ Utils.namedThreadFactory("appclient-register-master-threadpool"))
- private class ClientActor extends Actor with ActorLogReceive with
Logging {
- var master: ActorSelection = null
- var alreadyDisconnected = false // To avoid calling
listener.disconnected() multiple times
- var alreadyDead = false // To avoid calling listener.dead() multiple
times
- var registrationRetryTimer: Option[Cancellable] = None
+ private val registrationRetryThread =
Executors.newScheduledThreadPool(1,
+ Utils.namedThreadFactory("appclient-registration-retry-thread"))
- override def preStart() {
- context.system.eventStream.subscribe(self,
classOf[RemotingLifecycleEvent])
+ override def onStart(): Unit = {
try {
- registerWithMaster()
+ registerWithMaster(1)
} catch {
case e: Exception =>
logWarning("Failed to connect to master", e)
markDisconnected()
- context.stop(self)
+ stop()
}
}
- def tryRegisterAllMasters() {
- for (masterAkkaUrl <- masterAkkaUrls) {
- logInfo("Connecting to master " + masterAkkaUrl + "...")
- val actor = context.actorSelection(masterAkkaUrl)
- actor ! RegisterApplication(appDescription)
+ private def tryRegisterAllMasters(): Array[Future[_]] = {
+ for (masterAddress <- masterRpcAddresses) yield {
+ registerMasterThreadPool.submit(new Runnable {
+ override def run(): Unit = try {
+ if (registered) {
+ return
+ }
+ logInfo("Connecting to master " + masterAddress.toSparkURL +
"...")
+ val masterRef =
+ rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress,
Master.ENDPOINT_NAME)
+ masterRef.send(RegisterApplication(appDescription, self))
+ } catch {
+ case ie: InterruptedException => // Cancelled
+ case NonFatal(e) => logError(e.getMessage, e)
+ }
+ })
}
}
- def registerWithMaster() {
- tryRegisterAllMasters()
- import context.dispatcher
- var retries = 0
- registrationRetryTimer = Some {
- context.system.scheduler.schedule(REGISTRATION_TIMEOUT,
REGISTRATION_TIMEOUT) {
+ /**
+ * nthRetry means this is the nth attempt to register with master
+ */
+ private def registerWithMaster(nthRetry: Int) {
+ registerMasterFutures = tryRegisterAllMasters()
+ registrationRetryTimer =
registrationRetryThread.scheduleAtFixedRate(new Runnable {
+ override def run(): Unit = {
Utils.tryOrExit {
- retries += 1
if (registered) {
- registrationRetryTimer.foreach(_.cancel())
- } else if (retries >= REGISTRATION_RETRIES) {
+ registerMasterFutures.foreach(_.cancel(true))
+ registerMasterThreadPool.shutdownNow()
+ } else if (nthRetry >= REGISTRATION_RETRIES) {
markDead("All masters are unresponsive! Giving up.")
} else {
- tryRegisterAllMasters()
+ registerMasterFutures.foreach(_.cancel(true))
+ registerWithMaster(nthRetry + 1)
}
}
}
- }
+ }, REGISTRATION_TIMEOUT_SECONDS, REGISTRATION_TIMEOUT_SECONDS,
TimeUnit.SECONDS)
}
- def changeMaster(url: String) {
- // activeMasterUrl is a valid Spark url since we receive it from
master.
- activeMasterUrl = url
- master = context.actorSelection(
- Master.toAkkaUrl(activeMasterUrl, AkkaUtils.protocol(actorSystem)))
- masterAddress = Master.toAkkaAddress(activeMasterUrl,
AkkaUtils.protocol(actorSystem))
+ private def sendToMaster(message: Any): Unit = {
+ master match {
+ case Some(masterRef) => masterRef.send(message)
+ case None => logWarning(s"Drop $message because has not yet
connected to master")
+ }
}
- private def isPossibleMaster(remoteUrl: Address) = {
-
masterAkkaUrls.map(AddressFromURIString(_).hostPort).contains(remoteUrl.hostPort)
+ private def isPossibleMaster(remoteAddress: RpcAddress): Boolean = {
+ masterRpcAddresses.map(_.hostPort).contains(remoteAddress.hostPort)
}
- override def receiveWithLogging: PartialFunction[Any, Unit] = {
- case RegisteredApplication(appId_, masterUrl) =>
+ override def receive: PartialFunction[Any, Unit] = {
+ case RegisteredApplication(appId_, masterRef) =>
+ // FIXME How to handle the following cases?
--- End diff --
> So, this is the kind of thing that sendWithReply was meant to do. Can
that be used here instead?
Both RegisterApplication and RegisteredApplication are sent using `send`.
It's not the `ask` pattern.
My comment here is about sending `RegisterApplication` multiple times in
`registerWithMaster`.
1. In an extreme case, `RegisterApplication` may take as long as
`REGISTRATION_TIMEOUT_SECONDS` to arrive at the Master. Then we will send more
than one `RegisterApplication` to the Master. However, this looks like the
user's responsibility: they should increase `REGISTRATION_TIMEOUT_SECONDS`.
2. Consider the following order of events:
```
a. AppClient sends `RegisterApplication` to Master A and standby Master
B.
b. Master A receives RegisterApplication, and sends
`RegisteredApplication` back.
c. Master A crashes.
d. Master B starts to recover.
e. `RegisterApplication` arrives at Master B, and Master B sends
`RegisteredApplication` back.
f. `RegisteredApplication` from Master B arrives at AppClient.
g. `RegisteredApplication` from Master A arrives at AppClient.
```
Because this rarely happens, it may not be a big deal.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]