ChengJie1053 commented on code in PR #3843:
URL: https://github.com/apache/incubator-streampark/pull/3843#discussion_r1683824406
##########
streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/service/application/impl/SparkApplicationActionServiceImpl.java:
##########
@@ -508,8 +444,7 @@ private Tuple2<String, String> getUserJarAndAppConf(
FlinkSql flinkSql = flinkSqlService.getEffective(application.getId(), false);
AssertUtils.notNull(flinkSql);
Review Comment:
Should Spark SQL be distinguished from FlinkSql and stored in a separate table?
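For illustration only, a minimal sketch of what a dedicated entity and service could look like if Spark SQL gets its own table; every name here (SparkSql, t_spark_sql, SparkSqlService) is hypothetical and not part of this PR:

```java
import java.util.Date;

import com.baomidou.mybatisplus.annotation.TableName;
import com.baomidou.mybatisplus.extension.service.IService;

import lombok.Data;

// Hypothetical entity mirroring FlinkSql, persisted in its own table.
@Data
@TableName("t_spark_sql")
public class SparkSql {
    private Long id;
    private Long appId;
    // SQL text, stored the same encoded way FlinkSql stores its sql column.
    private String sql;
    private Integer version;
    private Date createTime;
}

// Hypothetical service mirroring FlinkSqlService.
interface SparkSqlService extends IService<SparkSql> {
    SparkSql getEffective(Long appId, boolean decode);
}
```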
##########
streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/service/application/impl/SparkApplicationActionServiceImpl.java:
##########
@@ -508,8 +444,7 @@ private Tuple2<String, String> getUserJarAndAppConf(
FlinkSql flinkSql = flinkSqlService.getEffective(application.getId(), false);
AssertUtils.notNull(flinkSql);
Review Comment:

Does this need to be modified here as well? It still fetches the effective SQL via flinkSqlService.
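If so, the call would presumably become something like the following (sparkSqlService and SparkSql are the hypothetical counterparts sketched in the previous comment):

```java
// Hypothetical: read the effective Spark SQL from a Spark-side service
// instead of flinkSqlService.
SparkSql sparkSql = sparkSqlService.getEffective(application.getId(), false);
AssertUtils.notNull(sparkSql);
```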
##########
streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/service/application/impl/SparkApplicationActionServiceImpl.java:
##########
@@ -417,16 +352,17 @@ public void start(SparkApplication appParam, boolean auto) throws Exception {
doStopped(application.getId());
} else {
SparkApplication app = getById(appParam.getId());
- app.setState(FlinkAppStateEnum.FAILED.getValue());
- app.setOptionState(OptionStateEnum.NONE.getValue());
+ app.setState(SparkAppStateEnum.FAILED.getValue());
+ app.setOptionState(SparkOptionStateEnum.NONE.getValue());
updateById(app);
- FlinkAppHttpWatcher.unWatching(appParam.getId());
+ SparkAppHttpWatcher.unWatching(appParam.getId());
}
return;
}
// 3) success
applicationLog.setSuccess(true);
+ // TODO: Change these to the corresponding Spark parameters
Review Comment:
Does this need to be modified here, or will it be modified in another PR?
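For context, the SparkAppHttpWatcher.unWatching referenced in this hunk could plausibly mirror FlinkAppHttpWatcher; a sketch consistent with the watcher maps shown later in this review (the exact body is an assumption):

```java
// Sketch: stop tracking an application, mirroring FlinkAppHttpWatcher.unWatching.
// WATCHING_APPS, STARTING_CACHE and OPTIONING are the maps declared in
// SparkAppHttpWatcher below; this exact body is an assumption.
public static void unWatching(Long appId) {
    WATCHING_APPS.remove(appId);
    STARTING_CACHE.invalidate(appId);
    OPTIONING.remove(appId);
}
```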
##########
streampark-spark/streampark-spark-client/streampark-spark-client-core/src/main/scala/org/apache/streampark/spark/client/impl/YarnApplicationClient.scala:
##########
@@ -18,107 +18,100 @@
package org.apache.streampark.spark.client.impl
import org.apache.streampark.common.conf.Workspace
+import org.apache.streampark.common.enums.SparkExecutionMode
+import org.apache.streampark.common.util.HadoopUtils
import org.apache.streampark.flink.packer.pipeline.ShadedBuildResponse
import org.apache.streampark.spark.client.`trait`.SparkClientTrait
import org.apache.streampark.spark.client.bean._
+import org.apache.streampark.spark.client.conf.SparkConfiguration
import org.apache.commons.collections.MapUtils
+import org.apache.hadoop.yarn.api.records.ApplicationId
import org.apache.spark.launcher.{SparkAppHandle, SparkLauncher}
import java.util.concurrent.{CountDownLatch, Executors, ExecutorService}
-import scala.util.control.Breaks.break
-
/** yarn application mode submit */
object YarnApplicationClient extends SparkClientTrait {
private val threadPool: ExecutorService = Executors.newFixedThreadPool(1)
private[this] lazy val workspace = Workspace.remote
- override def doCancel(cancelRequest: CancelRequest): CancelResponse = {
+ override def doStop(stopRequest: StopRequest): StopResponse = {
+ HadoopUtils.yarnClient.killApplication(ApplicationId.fromString(stopRequest.jobId))
null
}
override def setConfig(submitRequest: SubmitRequest): Unit = {}
override def doSubmit(submitRequest: SubmitRequest): SubmitResponse = {
launch(submitRequest)
- null
-
}
- private def launch(submitRequest: SubmitRequest): Unit = {
+ private def launch(submitRequest: SubmitRequest): SubmitResponse = {
val launcher: SparkLauncher = new SparkLauncher()
.setSparkHome(submitRequest.sparkVersion.sparkHome)
.setAppResource(submitRequest.buildResult
.asInstanceOf[ShadedBuildResponse]
.shadedJarPath)
.setMainClass(submitRequest.appMain)
.setMaster("yarn")
- .setDeployMode("cluster")
+ .setDeployMode(submitRequest.executionMode match {
+ case SparkExecutionMode.YARN_CLIENT => "client"
+ case SparkExecutionMode.YARN_CLUSTER => "cluster"
+ case _ =>
+ throw new IllegalArgumentException(
+ "[StreamPark][YarnApplicationClient] Yarn mode only support
\"client\" and \"cluster\".")
+
+ })
.setAppName(submitRequest.appName)
- .setConf("spark.executor.memory", "5g")
- .setConf("spark.executor.cores", "4")
- .setConf("spark.num.executors", "1")
.setConf(
"spark.yarn.jars",
submitRequest
- .asInstanceOf[SubmitRequest]
.hdfsWorkspace
.sparkLib + "/*.jar")
.setVerbose(true)
+ import scala.collection.JavaConverters._
+ setDynamicProperties(launcher, submitRequest.properties.asScala.toMap)
+
+ // TODO: Adds command line arguments for the application.
+ // launcher.addAppArgs()
Review Comment:
Does this need to be modified here (forwarding the application's command-line arguments)?
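If the arguments do need to be forwarded, SparkLauncher.addAppArgs(String...) is the spark-launcher API for it; a sketch in Java (how the arguments are carried on SubmitRequest, here getAppArgs(), is an assumption):

```java
// Sketch: forward the user program's command-line arguments to spark-submit.
// SparkLauncher.addAppArgs(String...) is part of the spark-launcher API;
// the getAppArgs() accessor on SubmitRequest is hypothetical.
String[] appArgs = submitRequest.getAppArgs();
if (appArgs != null && appArgs.length > 0) {
    launcher.addAppArgs(appArgs);
}
```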
##########
streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/watcher/SparkAppHttpWatcher.java:
##########
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.streampark.console.core.watcher;
+
+import org.apache.streampark.common.util.YarnUtils;
+import org.apache.streampark.console.base.util.JacksonUtils;
+import org.apache.streampark.console.base.util.Tuple2;
+import org.apache.streampark.console.base.util.Tuple3;
+import org.apache.streampark.console.core.bean.AlertTemplate;
+import org.apache.streampark.console.core.entity.SparkApplication;
+import org.apache.streampark.console.core.enums.SparkAppStateEnum;
+import org.apache.streampark.console.core.enums.SparkOptionStateEnum;
+import org.apache.streampark.console.core.enums.StopFromEnum;
+import org.apache.streampark.console.core.metrics.spark.Job;
+import org.apache.streampark.console.core.metrics.spark.SparkExecutor;
+import org.apache.streampark.console.core.metrics.yarn.YarnAppInfo;
+import org.apache.streampark.console.core.service.alert.AlertService;
+import org.apache.streampark.console.core.service.application.SparkApplicationActionService;
+import org.apache.streampark.console.core.service.application.SparkApplicationInfoService;
+import org.apache.streampark.console.core.service.application.SparkApplicationManageService;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.hc.core5.util.Timeout;
+
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import javax.annotation.PostConstruct;
+import javax.annotation.PreDestroy;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Executor;
+import java.util.concurrent.TimeUnit;
+
+@Slf4j
+@Component
+public class SparkAppHttpWatcher {
+
+ @Autowired
+ private SparkApplicationManageService applicationManageService;
+
+ @Autowired
+ private SparkApplicationActionService applicationActionService;
+
+ @Autowired
+ private SparkApplicationInfoService applicationInfoService;
+
+ @Autowired
+ private AlertService alertService;
+
+ @Qualifier("sparkRestAPIWatchingExecutor")
+ @Autowired
+ private Executor executorService;
+
+ // track interval every 5 seconds
+ public static final Duration WATCHING_INTERVAL = Duration.ofSeconds(5);
+
+ // option interval within 10 seconds
+ private static final Duration OPTION_INTERVAL = Duration.ofSeconds(10);
+
+ private static final Timeout HTTP_TIMEOUT = Timeout.ofSeconds(5);
+
+ /**
+ * Record the status of the first tracking task, because after the task is started, the overview
+ * of the task will be obtained during the first tracking
+ */
+ private static final Cache<Long, Byte> STARTING_CACHE =
+ Caffeine.newBuilder().expireAfterWrite(5, TimeUnit.MINUTES).build();
+
+ /** tracking task list */
+ private static final Map<Long, SparkApplication> WATCHING_APPS = new ConcurrentHashMap<>(0);
+
+ /**
+ * <pre>
+ * StopFrom: records whether the spark application was stopped by streampark or by other actions
+ * </pre>
+ */
+ private static final Map<Long, StopFromEnum> STOP_FROM_MAP = new ConcurrentHashMap<>(0);
+
+ /**
+ * Canceled task tracking list, recording who canceled each tracking task. Map<applicationId, userId>
+ */
+ private static final Map<Long, Long> CANCELLED_JOB_MAP = new ConcurrentHashMap<>(0);
+
+ private static final Map<Long, SparkOptionStateEnum> OPTIONING = new ConcurrentHashMap<>(0);
+
+ private Long lastWatchTime = 0L;
+
+ private Long lastOptionTime = 0L;
+
+ private static final Byte DEFAULT_FLAG_BYTE = Byte.valueOf("0");
+
+ @PostConstruct
+ public void init() {
+ WATCHING_APPS.clear();
+ List<SparkApplication> applications =
+ applicationManageService.list(
+ new LambdaQueryWrapper<SparkApplication>()
+ .eq(SparkApplication::getTracking, 1)
+ .ne(SparkApplication::getState, SparkAppStateEnum.LOST.getValue()));
+ applications.forEach(
+ (app) -> {
+ WATCHING_APPS.put(app.getId(), app);
+ STARTING_CACHE.put(app.getId(), DEFAULT_FLAG_BYTE);
+ });
+ }
+
+ @PreDestroy
+ public void doStop() {
+ log.info(
+ "[StreamPark][SparkAppHttpWatcher] StreamPark Console will be shut down, persisting applications to the database.");
+ WATCHING_APPS.forEach((k, v) -> applicationManageService.persistMetrics(v));
+ }
+
+ /**
+ * <strong>NOTE: The following conditions must be met for execution</strong>
+ *
+ * <p><strong>1) A task started by the program or operated from the page (such as start/stop)
+ * needs to return its state immediately (polled once per second, for 10 seconds, i.e. 10 times)</strong>
+ *
+ * <p><strong>2) Normal information is obtained once every 5 seconds</strong>
+ */
+ @Scheduled(fixedDelay = 1000)
+ public void start() {
+ Long timeMillis = System.currentTimeMillis();
+ if (lastWatchTime == null
+ || !OPTIONING.isEmpty()
+ || timeMillis - lastOptionTime <= OPTION_INTERVAL.toMillis()
+ || timeMillis - lastWatchTime >= WATCHING_INTERVAL.toMillis()) {
+ lastWatchTime = timeMillis;
+ WATCHING_APPS.forEach(this::watch);
+ }
+ }
+
+ @VisibleForTesting
+ public @Nullable SparkAppStateEnum tryQuerySparkAppState(@Nonnull Long appId) {
+ SparkApplication app = WATCHING_APPS.get(appId);
+ return (app == null || app.getState() == null) ? null : app.getStateEnum();
+ }
+
+ private void watch(Long id, SparkApplication application) {
+ executorService.execute(
+ () -> {
+ try {
+ getStateFromYarn(application);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ });
+ }
+
+ private StopFromEnum getAppStopFrom(Long appId) {
+ return STOP_FROM_MAP.getOrDefault(appId, StopFromEnum.NONE);
+ }
+
+ /**
+ * Query the job state from yarn and query the resource usage from spark when job state is RUNNING
+ *
+ * @param application spark application
+ */
+ private void getStateFromYarn(SparkApplication application) throws Exception {
+ SparkOptionStateEnum optionStateEnum = OPTIONING.get(application.getId());
+
+ // query the status from the yarn REST API
+ YarnAppInfo yarnAppInfo = httpYarnAppInfo(application);
+ if (yarnAppInfo == null) {
+ throw new RuntimeException("[StreamPark][SparkAppHttpWatcher] getStateFromYarn failed!");
+ } else {
+ try {
+ String state = yarnAppInfo.getApp().getState();
+ SparkAppStateEnum sparkAppStateEnum = SparkAppStateEnum.of(state);
+ if (SparkAppStateEnum.OTHER == sparkAppStateEnum) {
+ return;
+ }
+ if (SparkAppStateEnum.isEndState(sparkAppStateEnum.getValue())) {
+ log.info(
+ "[StreamPark][SparkAppHttpWatcher] getStateFromYarn, app {} was ended, jobId is {}, state is {}",
+ application.getId(),
+ application.getJobId(),
+ sparkAppStateEnum);
+ application.setEndTime(new Date());
+ }
+ if (SparkAppStateEnum.RUNNING == sparkAppStateEnum) {
+ Tuple3<Double, Double, Long> resourceStatus = getResourceStatus(application);
+ double memoryUsed = resourceStatus.t1;
+ double maxMemory = resourceStatus.t2;
+ double totalCores = resourceStatus.t3;
+ log.info(
+ "[StreamPark][SparkAppHttpWatcher] getStateFromYarn, app {} was running, jobId is {}, memoryUsed: {}MB, maxMemory: {}MB, totalCores: {}",
+ application.getId(),
+ application.getJobId(),
+ String.format("%.2f", memoryUsed),
+ String.format("%.2f", maxMemory),
+ totalCores);
+ // TODO: Modify the table structure to persist the results
Review Comment:
Does this TODO need to be deleted, or will persisting these results be handled in a follow-up?
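If the intent is to persist rather than delete, the change could be as small as three new columns on the application table plus the existing persistMetrics path; a sketch (the setter names and matching columns are assumptions):

```java
// Sketch: persist the probed resource usage instead of only logging it.
// The three setters (and matching columns on the application table) are
// hypothetical; persistMetrics(...) already exists on the manage service.
application.setMemoryUsed(resourceStatus.t1);   // MB
application.setMaxMemory(resourceStatus.t2);    // MB
application.setTotalCores(resourceStatus.t3);
applicationManageService.persistMetrics(application);
```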
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]