Github user vanzin commented on a diff in the pull request:
https://github.com/apache/spark/pull/5392#discussion_r27988469
--- Diff:
core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala ---
@@ -40,98 +37,127 @@ import org.apache.spark.util.{ActorLogReceive, Utils,
AkkaUtils}
* @param masterUrls Each url should look like spark://host:port.
*/
private[spark] class AppClient(
- actorSystem: ActorSystem,
+ rpcEnv: RpcEnv,
masterUrls: Array[String],
appDescription: ApplicationDescription,
listener: AppClientListener,
conf: SparkConf)
extends Logging {
- private val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl(_,
AkkaUtils.protocol(actorSystem)))
+ private val masterRpcAddresses =
masterUrls.map(RpcAddress.fromSparkURL(_))
- private val REGISTRATION_TIMEOUT = 20.seconds
+ private val REGISTRATION_TIMEOUT_SECONDS = 20
private val REGISTRATION_RETRIES = 3
- private var masterAddress: Address = null
- private var actor: ActorRef = null
+ private var endpoint: RpcEndpointRef = null
private var appId: String = null
- private var registered = false
- private var activeMasterUrl: String = null
+ @volatile private var registered = false
+
+ private class ClientEndpoint(override val rpcEnv: RpcEnv) extends
ThreadSafeRpcEndpoint
+ with Logging {
+
+ var master: Option[RpcEndpointRef] = None
+ var alreadyDisconnected = false // To avoid calling
listener.disconnected() multiple times
+ @volatile private var alreadyDead = false // To avoid calling
listener.dead() multiple times
+ @volatile private var registerMasterFutures: Array[Future[_]] = null
+ @volatile private var registrationRetryTimer: ScheduledFuture[_] = null
+
+ private val registerMasterThreadPool = new ThreadPoolExecutor(
+ 0,
+ masterRpcAddresses.size, // Make sure we can register with all
masters at the same time
+ 60L, TimeUnit.SECONDS,
+ new SynchronousQueue[Runnable](),
+ Utils.namedThreadFactory("appclient-register-master-threadpool"))
- private class ClientActor extends Actor with ActorLogReceive with
Logging {
- var master: ActorSelection = null
- var alreadyDisconnected = false // To avoid calling
listener.disconnected() multiple times
- var alreadyDead = false // To avoid calling listener.dead() multiple
times
- var registrationRetryTimer: Option[Cancellable] = None
+ private val registrationRetryThread =
Executors.newScheduledThreadPool(1,
+ Utils.namedThreadFactory("appclient-registration-retry-thread"))
- override def preStart() {
- context.system.eventStream.subscribe(self,
classOf[RemotingLifecycleEvent])
+ override def onStart(): Unit = {
try {
- registerWithMaster()
+ registerWithMaster(1)
} catch {
case e: Exception =>
logWarning("Failed to connect to master", e)
markDisconnected()
- context.stop(self)
+ stop()
}
}
- def tryRegisterAllMasters() {
- for (masterAkkaUrl <- masterAkkaUrls) {
- logInfo("Connecting to master " + masterAkkaUrl + "...")
- val actor = context.actorSelection(masterAkkaUrl)
- actor ! RegisterApplication(appDescription)
+ private def tryRegisterAllMasters(): Array[Future[_]] = {
+ for (masterAddress <- masterRpcAddresses) yield {
+ registerMasterThreadPool.submit(new Runnable {
+ override def run(): Unit = try {
+ if (registered) {
+ return
+ }
+ logInfo("Connecting to master " + masterAddress.toSparkURL +
"...")
+ val masterRef =
+ rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress,
Master.ENDPOINT_NAME)
+ masterRef.send(RegisterApplication(appDescription, self))
+ } catch {
+ case ie: InterruptedException => // Cancelled
+ case NonFatal(e) => logError(e.getMessage, e)
+ }
+ })
}
}
- def registerWithMaster() {
- tryRegisterAllMasters()
- import context.dispatcher
- var retries = 0
- registrationRetryTimer = Some {
- context.system.scheduler.schedule(REGISTRATION_TIMEOUT,
REGISTRATION_TIMEOUT) {
+ /**
+ * nthRetry means this is the nth attempt to register with master
+ */
+ private def registerWithMaster(nthRetry: Int) {
--- End diff --
Ah, I see. So all you need is a single `RegisteredApplication` reply to
exit this loop. That's fine.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it enabled, or if the feature is enabled but not
working, please contact infrastructure at [email protected] or file a JIRA
ticket with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]