mridulm commented on code in PR #3091:
URL: https://github.com/apache/celeborn/pull/3091#discussion_r2051654996


##########
verifier/src/main/scala/org/apache/celeborn/verifier/action/Operation.scala:
##########
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.action
+
+import java.util.concurrent.atomic.AtomicBoolean
+
+import scala.util.Random
+
+import org.apache.celeborn.common.internal.Logging
+import org.apache.celeborn.common.util.Utils
+import org.apache.celeborn.verifier.runner.RunnerContext
+import org.apache.celeborn.verifier.scheduler.SchedulerContext
+
+abstract class Operation(
+    val actionTarget: ActionTarget,
+    val updateContextBlock: RunnerContext => Unit = _ => {},
+    val interval: Long)
+  extends Serializable with Logging {
+
+  def executeOnRunner(context: RunnerContext): OperationResult
+
+  def updateSchedulerContext(context: SchedulerContext): Unit = {
+    val runContext =
+      RunnerContext(context.conf, 
context.runnerInfos.get(actionTarget.identity).resource)
+    updateContextBlock(runContext)
+  }
+
+  def executeCommand(command: String, block: => Unit): OperationResult = {
+    try {
+      val commandResult = Utils.runCommand(command)

Review Comment:
   `Utils.runCommand` is currently not handling stderr ... which can 
potentially be an issue (in case some of these commands generate a bunch of 
stderr output) - something to address.



##########
verifier/src/main/scala/org/apache/celeborn/verifier/plan/exception/PlanInvalidException.scala:
##########
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.plan.exception
+
+/**
+ * Invalid verification plan exception.
+ * @param message The message of invalid verification plan exception.
+ */
+class PlanInvalidException(val message: String) extends Exception(message) {}

Review Comment:
   nit: `InvalidPlanException`



##########
verifier/src/main/scala/org/apache/celeborn/verifier/runner/RunnerArguments.scala:
##########
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.runner
+
+import scala.annotation.tailrec
+
+import org.apache.celeborn.common.util.{IntParam, Utils}
+import org.apache.celeborn.verifier.conf.{VerifierArguments, VerifierConf}
+
+class RunnerArguments(override val args: Array[String], override val conf: 
VerifierConf)
+  extends VerifierArguments(args, conf) {
+
+  var host: String = Utils.localHostName(conf.celebornConf)
+  var port: Int = 0
+
+  @tailrec
+  final override def parseArguments(args: List[String]): Unit = {
+    args match {
+      case "-h" :: value :: tail =>
+        Utils.checkHost(value)
+        host = value
+        parseArguments(tail)
+      case "-p" :: IntParam(value) :: tail =>
+        port = value
+        parseArguments(tail)
+      case "--pf" :: value :: tail =>

Review Comment:
   `--pf` -> `--properties-file` to be consistent with similar existing usages ?



##########
verifier/src/main/scala/org/apache/celeborn/verifier/plan/VerificationPlan.scala:
##########
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.plan
+
+import org.apache.celeborn.verifier.action.Action
+
+class VerificationPlan(val actions: List[Action], val trigger: Trigger, val 
checker: Checker)

Review Comment:
   Wondering if it makes more sense to make this a `case class`



##########
verifier/src/main/scala/org/apache/celeborn/verifier/runner/RunnerArguments.scala:
##########
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.runner
+
+import scala.annotation.tailrec
+
+import org.apache.celeborn.common.util.{IntParam, Utils}
+import org.apache.celeborn.verifier.conf.{VerifierArguments, VerifierConf}
+
+class RunnerArguments(override val args: Array[String], override val conf: 
VerifierConf)
+  extends VerifierArguments(args, conf) {
+
+  var host: String = Utils.localHostName(conf.celebornConf)
+  var port: Int = 0
+
+  @tailrec
+  final override def parseArguments(args: List[String]): Unit = {
+    args match {
+      case "-h" :: value :: tail =>
+        Utils.checkHost(value)
+        host = value
+        parseArguments(tail)
+      case "-p" :: IntParam(value) :: tail =>

Review Comment:
   `("--port" | "-p")`
   
   This applies here and below, and also to `SchedulerArguments`.



##########
sbin/verf-cli.sh:
##########
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash

Review Comment:
   nit: Here, and for other files - explicitly rename the script to 
`verifier-*.sh`? (Internal variables like `CELEBORN_VERF_CLI_JAVA_OPTS` are 
fine, but the scripts themselves get exposed to users.)
   We don't gain much by the shortening (and it is not a standard name from what 
I can tell).



##########
verifier/src/main/scala/org/apache/celeborn/verifier/conf/VerifierConf.scala:
##########
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.conf
+
+import java.io.File
+import java.util
+
+import org.apache.celeborn.common.CelebornConf
+import org.apache.celeborn.common.rpc.RpcAddress
+import org.apache.celeborn.common.util.Utils
+
+class VerifierConf {
+
+  val celebornConf = new CelebornConf()
+  private val settings = new util.HashMap[String, String]()
+
+  def get(key: String, defaultValue: String): String = {
+    Option(settings.get(key)).getOrElse(defaultValue)
+  }
+
+  def set(key: String, value: String): Unit = {
+    settings.put(key, value)
+  }
+}
+
+object VerifierConf {
+
+  private val defaultScriptsLocation: String =
+    sys.env.get("CELEBORN_HOME").map { t => s"$t${File.separator}sbin" }
+      .map { t => new File(s"$t") }
+      .filter(_.isFile)
+      .map(_.getAbsolutePath)
+      .orNull
+
+  def schedulerAddress(conf: VerifierConf): RpcAddress = {
+    val parts = conf.get(
+      "verf.scheduler.address",

Review Comment:
   Use `celeborn.verifier` namespace instead ?



##########
verifier/src/main/scala/org/apache/celeborn/verifier/scheduler/Scheduler.scala:
##########
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.verifier.scheduler
+
+import java.util.concurrent.{ConcurrentHashMap, Future, ScheduledFuture, 
TimeUnit}
+import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference}
+
+import scala.collection.JavaConverters.{enumerationAsScalaIteratorConverter, 
mapAsScalaConcurrentMapConverter}
+import scala.concurrent.{ExecutionContext, ExecutionContextExecutor}
+import scala.concurrent.duration.Duration
+import scala.util.Random
+
+import org.apache.celeborn.common.internal.Logging
+import org.apache.celeborn.common.metrics.source.Role
+import org.apache.celeborn.common.protocol.TransportModuleConstants
+import org.apache.celeborn.common.protocol.message.StatusCode
+import org.apache.celeborn.common.rpc.{RpcCallContext, RpcEndpoint, RpcEnv}
+import org.apache.celeborn.common.util.ThreadUtils
+import org.apache.celeborn.verifier.action.Operation
+import org.apache.celeborn.verifier.conf.VerifierConf
+import org.apache.celeborn.verifier.info.{NodeStatus, RunnerInfo}
+import org.apache.celeborn.verifier.plan.VerificationPlan
+import org.apache.celeborn.verifier.plan.exception.PlanExecutionException
+import org.apache.celeborn.verifier.protocol._
+
+class Scheduler(override val rpcEnv: RpcEnv, val conf: VerifierConf)
+  extends RpcEndpoint with Logging {
+
+  // key:runnerId -> (resource, time, rpcEndpoint)
+  private val runnerInfos = new ConcurrentHashMap[String, RunnerInfo]
+  private val planExecutor =
+    
ThreadUtils.newDaemonSingleThreadExecutor("verifier-scheduler-plan-executor")
+  private val timeoutChecker =
+    
ThreadUtils.newDaemonSingleThreadScheduledExecutor("verifier-scheduler-timeout-checker")
+  private val resultExecutor =
+    ThreadUtils.newDaemonFixedThreadPool(8, 
"verifier-scheduler-result-executor")
+  private val timeoutThreshold = VerifierConf.runnerTimeOutMs(conf)
+  private val stopPlanFlag = new AtomicBoolean(true)
+  private val pausePlanFlag = new AtomicBoolean(false)
+  private val runningPlan = new AtomicReference[VerificationPlan](null)
+  private var planExecuteTask: Future[_] = _
+  private var timeoutCheckTask: ScheduledFuture[_] = _
+
+  override def onStart(): Unit = {
+
+    class PlanLoop extends Runnable {
+      override def run(): Unit = {
+        try {
+          implicit val contextExecutor: ExecutionContextExecutor =
+            ExecutionContext.fromExecutor(resultExecutor)
+          val schedulerContext = new SchedulerContext(conf, runnerInfos)
+          while (true) {
+            val plan = runningPlan.get()
+            if (plan == null) {
+              Thread.sleep(1000)
+            } else {
+              val trigger = plan.trigger
+              val actions = plan.actions
+              val checker = plan.checker
+              val actionCount = actions.size
+              var actionIndex = 0
+              var planLoopCount = 1
+              val tendentiousActions = actions.groupBy(_.tendency)
+              while (!stopPlanFlag.get()) {
+                if (pausePlanFlag.get()) {
+                  Thread.sleep(1000L)
+                }
+                val actionToExecute =
+                  if (trigger.random) {
+                    val actionCandidates = tendentiousActions(checker.tendency)
+                      .filter(ac => 
checker.availableTargets.contains(ac.target))
+                    actionCandidates(Random.nextInt(actionCandidates.size))
+                  } else {
+                    logInfo(s"Plan loop: $planLoopCount.")
+                    actions(actionIndex)
+                  }
+                val operations = 
actionToExecute.generateOperations(schedulerContext)
+                logDebug(
+                  s"Current action: $actionToExecute, operations: 
${operations.map(_.toString)}.")
+                val permissionToExecute =
+                  if (operations.isEmpty) {
+                    false
+                  } else {
+                    val deducedContext = 
actionToExecute.deduce(schedulerContext, operations)
+                    val permissionToRun = checker.validate(deducedContext)
+                    logDebug(s"Current context: ${schedulerContext.toString}  
=> deduced context: ${deducedContext.toString}, " +
+                      s"current action executes $permissionToRun.")
+                    permissionToRun
+                  }
+                if (permissionToExecute) {
+                  def executeOperationSync(operation: Operation): Unit = {
+                    val executeOperationFuture =
+                      
runnerInfos.get(operation.actionTarget.identity).rpcEndpointRef
+                        
.ask[ExecuteOperationResponse](ExecuteOperation(operation))
+                    val executeOperationResponse =
+                      ThreadUtils.awaitReady(executeOperationFuture, 
Duration.Inf).value.get.get
+                    if (executeOperationResponse.statusCode != 
StatusCode.SUCCESS) {
+                      throw new PlanExecutionException("Operation failed, plan 
execution abort.")
+                    }
+                    operation.updateSchedulerContext(schedulerContext)
+                    Thread.sleep(operation.interval)
+                  }
+                  operations.foreach(executeOperationSync)
+                  Thread.sleep(actionToExecute.interval)
+                }
+                if (trigger.sequence) {
+                  actionIndex = actionIndex + 1
+                  if (actionIndex >= actionCount - 1) {
+                    planLoopCount = planLoopCount + 1
+                    if (planLoopCount > trigger.repeat) {
+                      logInfo("Plan execution completed, cleaning verification 
plan.")
+                      stopPlanFlag.set(true)
+                      runningPlan.set(null)
+                    } else {
+                      actionIndex = 0
+                    }
+                  }
+                }
+                Thread.sleep(trigger.interval.getInterval)
+              }
+            }
+          }
+        } catch {
+          case t: Throwable =>
+            try {
+              logError("Plan executed with exception, cleaning verification 
plan.", t)
+              planExecuteTask = planExecutor.submit(new PlanLoop)
+              stopPlanFlag.set(true)
+              runningPlan.set(null)

Review Comment:
   QQ: Why are we resubmitting the `PlanLoop`? (given we have set 
`stopPlanFlag` to `true` anyway)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to