Github user andrewor14 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/5144#discussion_r27171790
  
    --- Diff: core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala ---
    @@ -0,0 +1,552 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.scheduler.cluster.mesos
    +
    +import java.io.File
    +import java.text.SimpleDateFormat
    +import java.util.concurrent.atomic.AtomicLong
    +import java.util.concurrent.locks.ReentrantLock
    +import java.util.{Date, List => JList}
    +
    +import org.apache.mesos.{SchedulerDriver, Scheduler}
    +import org.apache.mesos.Protos._
    +import org.apache.spark.deploy.master.DriverState
    +import org.apache.spark.deploy.master.DriverState.DriverState
    +import org.apache.spark.SparkConf
    +import org.apache.spark.SparkException
    +import org.apache.spark.util.Utils
    +
    +import scala.collection.mutable
    +import scala.collection.mutable.ArrayBuffer
    +import scala.collection.JavaConversions._
    +
    +import scala.concurrent.duration.Duration
    +import org.apache.mesos.Protos.Environment.Variable
    +import org.apache.spark.deploy.mesos.MesosDriverDescription
    +import org.apache.mesos.Protos.TaskStatus.Reason
    +
    +private[spark] class DriverSubmission(
    +    val submissionId: String,
    +    val desc: MesosDriverDescription,
    +    val submitDate: Date) extends Serializable {
    +
    +  def canEqual(other: Any): Boolean = other.isInstanceOf[DriverSubmission]
    +
    +  override def equals(other: Any): Boolean = other match {
    +    case that: DriverSubmission =>
    +      (that canEqual this) &&
    +        submissionId == that.submissionId
    +    case _ => false
    +  }
    +}
    +
    +private [spark] case class ClusterTaskState(
    +    val submission: DriverSubmission,
    +    val taskId: TaskID,
    +    val slaveId: SlaveID,
    +    var taskState: Option[TaskStatus],
    +    var driverState: DriverState,
    +    var startDate: Date,
    +    val lastRetry: Option[RetryState] = None) extends Serializable {
    +
    +  def copy(): ClusterTaskState = {
    +    ClusterTaskState(submission, taskId, slaveId, taskState, driverState, startDate, lastRetry)
    +  }
    +}
    +
    +private[spark] case class SubmitResponse(id: String, success: Boolean, message: String)
    +
    +private[spark] case class StatusResponse(
    +    id: String,
    +    success: Boolean,
    +    state: String,
    +    status: Option[TaskStatus] = None)
    +
    +private[spark] case class KillResponse(id: String, success: Boolean, message: String)
    +
    +private[spark] case class ClusterSchedulerState(
    +    appId: String,
    +    queuedDrivers: Iterable[DriverSubmission],
    +    launchedDrivers: Iterable[ClusterTaskState],
    +    finishedDrivers: Iterable[ClusterTaskState],
    +    retryList: Iterable[RetryState])
    +
    +private[spark] trait ClusterScheduler {
    +  def submitDriver(desc: MesosDriverDescription): SubmitResponse
    +
    +  def killDriver(submissionId: String): KillResponse
    +
    +  def getStatus(submissionId: String): StatusResponse
    +
    +  def getState(): ClusterSchedulerState
    +}
    +
    +private[spark] class MesosClusterScheduler(
    +    engineFactory: ClusterPersistenceEngineFactory,
    +    conf: SparkConf) extends Scheduler with MesosSchedulerHelper with ClusterScheduler {
    +
    +  var frameworkUrl: String = _
    +
    +  val master = conf.get("spark.master")
    +  val appName = conf.get("spark.app.name")
    +  val queuedCapacity = conf.getInt("spark.deploy.mesos.queuedDrivers", 200)
    +  val retainedDrivers = conf.getInt("spark.deploy.retainedDrivers", 200)
    +  val maxRetryWaitTime = conf.getInt("spark.mesos.cluster.retry.wait.max", 60) // 1 minute
    +  val state = engineFactory.createEngine("scheduler")
    +  val stateTimeout =
    +    Duration.create(conf.getLong("spark.mesos.cluster.recover.timeout", 30), "seconds")
    +
    +  val stateLock = new ReentrantLock()
    +
    +  val finishedDrivers = new mutable.ArrayBuffer[ClusterTaskState](retainedDrivers)
    +
    +  val nextDriverNumber: AtomicLong = new AtomicLong(0)
    +  var appId: String = null
    +
    +  private var launchedDrivers: LaunchedDrivers = _
    +
    +  private var queue: DriverQueue = _
    +
    +  def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss")  // For application IDs
    +
    +  private var superviseRetryList: SuperviseRetryList = _
    +
    +  private def newDriverId(submitDate: Date): String = {
    +    "driver-%s-%04d".format(
    +        createDateFormat.format(submitDate), nextDriverNumber.incrementAndGet())
    +  }
    +
    +  def submitDriver(desc: MesosDriverDescription): SubmitResponse = {
    +    stateLock.synchronized {
    +      if (queue.isFull) {
    +        return SubmitResponse("", false, "Already reached maximum submission size")
    +      }
    +
    +      val submitDate: Date = new Date()
    +      val submissionId: String = newDriverId(submitDate)
    +      val submission = new DriverSubmission(submissionId, desc, submitDate)
    +      queue.offer(submission)
    +      SubmitResponse(submissionId, true, "")
    +    }
    +  }
    +
    +  def killDriver(submissionId: String): KillResponse = {
    +    stateLock.synchronized {
    +      // We look for the requested driver in the following places:
    +      // 1. Check if submission is running or launched.
    +      // 2. Check if it's still queued.
    +      // 3. Check if it's in the retry list.
    +      if (launchedDrivers.contains(submissionId)) {
    +        val task = launchedDrivers.get(submissionId)
    +        driver.killTask(task.taskId)
    +        return KillResponse(submissionId, true, "Killing running driver")
    +      } else if (queue.remove(submissionId)) {
    +        return KillResponse(submissionId, true, "Removed driver while it's still pending")
    +      } else if (superviseRetryList.remove(submissionId)) {
    +        return KillResponse(submissionId, true, "Removed driver while it's retrying")
    +      } else {
    +        return KillResponse(submissionId, false, "Cannot find driver")
    +      }
    +    }
    +  }
    +
    +  def recoverState {
    +    stateLock.synchronized {
    +      queue = new DriverQueue(engineFactory.createEngine("driverQueue"), queuedCapacity)
    +
    +      launchedDrivers = new LaunchedDrivers(engineFactory.createEngine("launchedDrivers"))
    +
    +      // There is a potential timing issue where a queued driver might have been launched
    +      // but the scheduler shuts down before the queued driver was able to be removed
    +      // from the queue. We try to mitigate this issue by walking through all queued drivers
    +      // and removing them if they're already launched.
    +      queue.drivers.foreach {
    +        d => if (launchedDrivers.contains(d.submissionId)) {
    +          queue.remove(d.submissionId)
    +        }
    +      }
    +
    +      superviseRetryList = new SuperviseRetryList(engineFactory.createEngine("retryList"))
    +
    +      // TODO: Consider storing finished drivers so we can show them on the UI after
    +      // failover. For now we clear the history on each recovery.
    +      finishedDrivers.clear()
    +    }
    +  }
    +
    +  def start() {
    +    // TODO: Implement leader election to make sure only one framework is running in the cluster.
    +    val fwId = state.fetch[String]("frameworkId")
    +
    +    val builder = FrameworkInfo.newBuilder()
    +      .setUser(Utils.getCurrentUserName())
    +      .setName(appName)
    +      .setWebuiUrl(frameworkUrl)
    +      .setCheckpoint(true)
    +      .setFailoverTimeout(Integer.MAX_VALUE) // Set to max so tasks keep running until recovery
    +
    +    fwId.foreach { id =>
    +      builder.setId(FrameworkID.newBuilder().setValue(id).build())
    +      appId = id
    +    }
    +
    +    // Recover scheduler state that is persisted.
    +    // We still need to do task reconciliation to be up to date with the latest task states,
    +    // as they might have changed while the scheduler is failing over.
    +    recoverState
    +    startScheduler("MesosClusterScheduler", master, MesosClusterScheduler.this, builder.build())
    +  }
    +
    +  def stop() {
    +    driver.stop(true)
    +  }
    +
    +  override def registered(
    +      driver: SchedulerDriver,
    +      frameworkId: FrameworkID,
    +      masterInfo: MasterInfo): Unit = {
    +    logInfo("Registered as framework ID " + frameworkId.getValue)
    +    if (frameworkId.getValue != appId) {
    +      appId = frameworkId.getValue
    +      state.persist("frameworkId", appId)
    +    }
    +    markRegistered()
    +
    +    stateLock.synchronized {
    +      if (!launchedDrivers.pendingRecover.isEmpty) {
    +        // Start task reconciliation if we need to recover.
    +        val statuses = launchedDrivers.pendingRecover.collect {
    +          case (taskId, slaveId) =>
    +            launchedDrivers.get(taskId).taskState.getOrElse(
    +              TaskStatus.newBuilder()
    +                .setTaskId(TaskID.newBuilder().setValue(taskId).build())
    +                .setSlaveId(slaveId)
    +                .setState(TaskState.TASK_STAGING)
    +                .build)
    +        }
    +
    +        // TODO: Page the status updates to avoid trying to reconcile
    +        // a large amount of tasks at once.
    +        driver.reconcileTasks(statuses)
    +      }
    +    }
    +  }
    +
    +  private def buildCommand(req: DriverSubmission): CommandInfo = {
    +    val desc = req.desc
    +
    +    val appJar = CommandInfo.URI.newBuilder()
    +      .setValue(desc.desc.jarUrl.stripPrefix("file:").stripPrefix("local:")).build()
    +
    +    val builder = CommandInfo.newBuilder()
    +      .addUris(appJar)
    +
    +    val entries =
    +      (conf.getOption("spark.executor.extraLibraryPath").toList ++
    +        desc.desc.command.libraryPathEntries)
    +
    +    val prefixEnv = if (!entries.isEmpty) {
    +      Utils.libraryPathEnvPrefix(entries)
    +    } else {
    +      ""
    +    }
    +
    +    val envBuilder = Environment.newBuilder()
    +    desc.desc.command.environment.foreach {
    +      case (k, v) =>
    +        envBuilder.addVariables(
    +          Variable.newBuilder().setName(k).setValue(v).build())
    +    }
    +
    +    builder.setEnvironment(envBuilder.build())
    +
    +    val cmdOptions = generateCmdOption(req)
    +
    +    val executorUri = req.desc.schedulerProperties.get("spark.executor.uri")
    +      .orElse(req.desc.desc.command.environment.get("SPARK_EXECUTOR_URI"))
    +
    +    val cmd = if (executorUri.isDefined) {
    +      builder.addUris(CommandInfo.URI.newBuilder().setValue(executorUri.get).build())
    +
    +      val folderBasename = executorUri.get.split('/').last.split('.').head
    +
    +      val cmdExecutable = s"cd $folderBasename*; $prefixEnv bin/spark-submit"
    +
    +      val cmdJar = s"../${desc.desc.jarUrl.split("/").last}"
    +
    +      val appArguments = desc.desc.command.arguments.mkString(" ")
    +
    +      s"$cmdExecutable ${cmdOptions.mkString(" ")} $cmdJar $appArguments"
    +    } else {
    +      val executorSparkHome = req.desc.schedulerProperties.get("spark.mesos.executor.home")
    +        .orElse(conf.getOption("spark.home"))
    +        .orElse(Option(System.getenv("SPARK_HOME")))
    +        .getOrElse {
    +          throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
    +        }
    +
    +      val cmdExecutable = new File(executorSparkHome, "./bin/spark-submit").getCanonicalPath
    +
    +      val cmdJar = desc.desc.jarUrl.split("/").last
    +
    +      s"$cmdExecutable ${cmdOptions.mkString(" ")} $cmdJar"
    +    }
    +
    +    builder.setValue(cmd)
    +
    +    builder.build
    +  }
    +
    +  private def generateCmdOption(req: DriverSubmission): Seq[String] = {
    +    var options = Seq(
    +        "--name", req.desc.schedulerProperties("spark.app.name"),
    +        "--class", req.desc.desc.command.mainClass,
    +        "--master", s"mesos://${conf.get("spark.master")}",
    +        "--driver-cores", req.desc.desc.cores.toString,
    +        "--driver-memory", s"${req.desc.desc.mem}M")
    +
    +    req.desc.schedulerProperties.get("spark.executor.memory").map { v =>
    +      options ++= Seq("--executor-memory", v)
    +    }
    +
    +    req.desc.schedulerProperties.get("spark.cores.max").map { v =>
    +      options ++= Seq("--total-executor-cores", v)
    +    }
    +
    +    options
    +  }
    +
    +  private [spark] case class ResourceOffer(val offer: Offer, var cpu: Double, var mem: Double)
    --- End diff ---
    
    Why is this `private[spark]`? It's only used in this file. Also, case classes
    by definition should be immutable; if it takes in a `var`, it should probably
    not be a case class.
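    
    For example, something along these lines (just a rough sketch, the names below are illustrative and not from the patch): either keep the fields immutable and hand out updated copies, or use a plain class if in-place mutation is really needed.
    
    ```scala
    import org.apache.mesos.Protos.Offer
    
    // Option 1: keep it a case class, but with immutable fields; callers create
    // updated copies as resources are consumed, e.g.
    //   val remaining = resourceOffer.copy(cpu = resourceOffer.cpu - driverCpu)
    private case class ResourceOffer(offer: Offer, cpu: Double, mem: Double)
    
    // Option 2: if mutation really is needed, use a plain class scoped to this
    // file instead of a case class (hypothetical name).
    private class MutableResourceOffer(val offer: Offer, var cpu: Double, var mem: Double)
    ```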

