Re: [PR] [Feature] Support spark job status tracking [incubator-streampark]

via GitHub Thu, 18 Jul 2024 23:15:26 -0700


ChengJie1053 commented on code in PR #3843:
URL: 
https://github.com/apache/incubator-streampark/pull/3843#discussion_r1683854458



##########
streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/watcher/SparkAppHttpWatcher.java:
##########
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.streampark.console.core.watcher;
+
+import org.apache.streampark.common.util.YarnUtils;
+import org.apache.streampark.console.base.util.JacksonUtils;
+import org.apache.streampark.console.base.util.Tuple2;
+import org.apache.streampark.console.base.util.Tuple3;
+import org.apache.streampark.console.core.bean.AlertTemplate;
+import org.apache.streampark.console.core.entity.SparkApplication;
+import org.apache.streampark.console.core.enums.SparkAppStateEnum;
+import org.apache.streampark.console.core.enums.SparkOptionStateEnum;
+import org.apache.streampark.console.core.enums.StopFromEnum;
+import org.apache.streampark.console.core.metrics.spark.Job;
+import org.apache.streampark.console.core.metrics.spark.SparkExecutor;
+import org.apache.streampark.console.core.metrics.yarn.YarnAppInfo;
+import org.apache.streampark.console.core.service.alert.AlertService;
+import 
org.apache.streampark.console.core.service.application.SparkApplicationActionService;
+import 
org.apache.streampark.console.core.service.application.SparkApplicationInfoService;
+import 
org.apache.streampark.console.core.service.application.SparkApplicationManageService;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.hc.core5.util.Timeout;
+
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import javax.annotation.PostConstruct;
+import javax.annotation.PreDestroy;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Executor;
+import java.util.concurrent.TimeUnit;
+
+@Slf4j
+@Component
+public class SparkAppHttpWatcher {
+
+    @Autowired
+    private SparkApplicationManageService applicationManageService;
+
+    @Autowired
+    private SparkApplicationActionService applicationActionService;
+
+    @Autowired
+    private SparkApplicationInfoService applicationInfoService;
+
+    @Autowired
+    private AlertService alertService;
+
+    @Qualifier("sparkRestAPIWatchingExecutor")
+    @Autowired
+    private Executor executorService;
+
+    // track interval every 5 seconds
+    public static final Duration WATCHING_INTERVAL = Duration.ofSeconds(5);
+
+    // option interval within 10 seconds
+    private static final Duration OPTION_INTERVAL = Duration.ofSeconds(10);
+
+    private static final Timeout HTTP_TIMEOUT = Timeout.ofSeconds(5);
+
+    /**
+     * Record the status of the first tracking task, because after the task is 
started, the overview
+     * of the task will be obtained during the first tracking
+     */
+    private static final Cache<Long, Byte> STARTING_CACHE =
+        Caffeine.newBuilder().expireAfterWrite(5, TimeUnit.MINUTES).build();
+
+    /** tracking task list */
+    private static final Map<Long, SparkApplication> WATCHING_APPS = new 
ConcurrentHashMap<>(0);
+
+    /**
+     * <pre>
+     * StopFrom: Recording spark application stopped by streampark or stopped 
by other actions
+     * </pre>
+     */
+    private static final Map<Long, StopFromEnum> STOP_FROM_MAP = new 
ConcurrentHashMap<>(0);
+
+    /**
+     * Task canceled tracking list, record who cancelled the tracking task 
Map<applicationId,userId>
+     */
+    private static final Map<Long, Long> CANCELLED_JOB_MAP = new 
ConcurrentHashMap<>(0);
+
+    private static final Map<Long, SparkOptionStateEnum> OPTIONING = new 
ConcurrentHashMap<>(0);
+
+    private Long lastWatchTime = 0L;
+
+    private Long lastOptionTime = 0L;
+
+    private static final Byte DEFAULT_FLAG_BYTE = Byte.valueOf("0");
+
+    @PostConstruct
+    public void init() {
+        WATCHING_APPS.clear();
+        List<SparkApplication> applications =
+            applicationManageService.list(
+                new LambdaQueryWrapper<SparkApplication>()
+                    .eq(SparkApplication::getTracking, 1)
+                    .ne(SparkApplication::getState, 
SparkAppStateEnum.LOST.getValue()));
+        applications.forEach(
+            (app) -> {
+                WATCHING_APPS.put(app.getId(), app);
+                STARTING_CACHE.put(app.getId(), DEFAULT_FLAG_BYTE);
+            });
+    }
+
+    @PreDestroy
+    public void doStop() {
+        log.info(
+            "[StreamPark][SparkAppHttpWatcher] StreamPark Console will be 
shutdown, persistent application to database.");
+        WATCHING_APPS.forEach((k, v) -> 
applicationManageService.persistMetrics(v));
+    }
+
+    /**
+     * <strong>NOTE: The following conditions must be met for 
execution</strong>
+     *
+     * <p><strong>1) Program started or page operated task, such as 
start/stop, needs to return the
+     * state immediately. (the frequency of 1 second once, continued 10 
seconds (10 times))</strong>
+     *
+     * <p><strong>2) Normal information obtain, once every 5 seconds</strong>
+     */
+    @Scheduled(fixedDelay = 1000)
+    public void start() {
+        Long timeMillis = System.currentTimeMillis();
+        if (lastWatchTime == null
+            || !OPTIONING.isEmpty()
+            || timeMillis - lastOptionTime <= OPTION_INTERVAL.toMillis()
+            || timeMillis - lastWatchTime >= WATCHING_INTERVAL.toMillis()) {
+            lastWatchTime = timeMillis;
+            WATCHING_APPS.forEach(this::watch);
+        }
+    }
+
+    @VisibleForTesting
+    public @Nullable SparkAppStateEnum tryQuerySparkAppState(@Nonnull Long 
appId) {
+        SparkApplication app = WATCHING_APPS.get(appId);
+        return (app == null || app.getState() == null) ? null : 
app.getStateEnum();
+    }
+
+    private void watch(Long id, SparkApplication application) {
+        executorService.execute(
+            () -> {
+                try {
+                    getStateFromYarn(application);
+                } catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+            });
+    }
+
+    private StopFromEnum getAppStopFrom(Long appId) {
+        return STOP_FROM_MAP.getOrDefault(appId, StopFromEnum.NONE);

Review Comment:
   `getAppStopFrom` is never used; consider removing it.



##########
streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/watcher/SparkAppHttpWatcher.java:
##########
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.streampark.console.core.watcher;
+
+import org.apache.streampark.common.util.YarnUtils;
+import org.apache.streampark.console.base.util.JacksonUtils;
+import org.apache.streampark.console.base.util.Tuple2;
+import org.apache.streampark.console.base.util.Tuple3;
+import org.apache.streampark.console.core.bean.AlertTemplate;
+import org.apache.streampark.console.core.entity.SparkApplication;
+import org.apache.streampark.console.core.enums.SparkAppStateEnum;
+import org.apache.streampark.console.core.enums.SparkOptionStateEnum;
+import org.apache.streampark.console.core.enums.StopFromEnum;
+import org.apache.streampark.console.core.metrics.spark.Job;
+import org.apache.streampark.console.core.metrics.spark.SparkExecutor;
+import org.apache.streampark.console.core.metrics.yarn.YarnAppInfo;
+import org.apache.streampark.console.core.service.alert.AlertService;
+import 
org.apache.streampark.console.core.service.application.SparkApplicationActionService;
+import 
org.apache.streampark.console.core.service.application.SparkApplicationInfoService;
+import 
org.apache.streampark.console.core.service.application.SparkApplicationManageService;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.hc.core5.util.Timeout;
+
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import javax.annotation.PostConstruct;
+import javax.annotation.PreDestroy;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Executor;
+import java.util.concurrent.TimeUnit;
+
+@Slf4j
+@Component
+public class SparkAppHttpWatcher {
+
+    @Autowired
+    private SparkApplicationManageService applicationManageService;
+
+    @Autowired
+    private SparkApplicationActionService applicationActionService;
+
+    @Autowired
+    private SparkApplicationInfoService applicationInfoService;
+
+    @Autowired
+    private AlertService alertService;
+
+    @Qualifier("sparkRestAPIWatchingExecutor")
+    @Autowired
+    private Executor executorService;
+
+    // track interval every 5 seconds
+    public static final Duration WATCHING_INTERVAL = Duration.ofSeconds(5);
+
+    // option interval within 10 seconds
+    private static final Duration OPTION_INTERVAL = Duration.ofSeconds(10);
+
+    private static final Timeout HTTP_TIMEOUT = Timeout.ofSeconds(5);
+
+    /**
+     * Record the status of the first tracking task, because after the task is 
started, the overview
+     * of the task will be obtained during the first tracking
+     */
+    private static final Cache<Long, Byte> STARTING_CACHE =
+        Caffeine.newBuilder().expireAfterWrite(5, TimeUnit.MINUTES).build();
+
+    /** tracking task list */
+    private static final Map<Long, SparkApplication> WATCHING_APPS = new 
ConcurrentHashMap<>(0);
+
+    /**
+     * <pre>
+     * StopFrom: Recording spark application stopped by streampark or stopped 
by other actions
+     * </pre>
+     */
+    private static final Map<Long, StopFromEnum> STOP_FROM_MAP = new 
ConcurrentHashMap<>(0);
+
+    /**
+     * Task canceled tracking list, record who cancelled the tracking task 
Map<applicationId,userId>
+     */
+    private static final Map<Long, Long> CANCELLED_JOB_MAP = new 
ConcurrentHashMap<>(0);
+
+    private static final Map<Long, SparkOptionStateEnum> OPTIONING = new 
ConcurrentHashMap<>(0);
+
+    private Long lastWatchTime = 0L;
+
+    private Long lastOptionTime = 0L;
+
+    private static final Byte DEFAULT_FLAG_BYTE = Byte.valueOf("0");
+
+    @PostConstruct
+    public void init() {
+        WATCHING_APPS.clear();
+        List<SparkApplication> applications =
+            applicationManageService.list(
+                new LambdaQueryWrapper<SparkApplication>()
+                    .eq(SparkApplication::getTracking, 1)
+                    .ne(SparkApplication::getState, 
SparkAppStateEnum.LOST.getValue()));
+        applications.forEach(
+            (app) -> {
+                WATCHING_APPS.put(app.getId(), app);
+                STARTING_CACHE.put(app.getId(), DEFAULT_FLAG_BYTE);
+            });
+    }
+
+    @PreDestroy
+    public void doStop() {
+        log.info(
+            "[StreamPark][SparkAppHttpWatcher] StreamPark Console will be 
shutdown, persistent application to database.");
+        WATCHING_APPS.forEach((k, v) -> 
applicationManageService.persistMetrics(v));
+    }
+
+    /**
+     * <strong>NOTE: The following conditions must be met for 
execution</strong>
+     *
+     * <p><strong>1) Program started or page operated task, such as 
start/stop, needs to return the
+     * state immediately. (the frequency of 1 second once, continued 10 
seconds (10 times))</strong>
+     *
+     * <p><strong>2) Normal information obtain, once every 5 seconds</strong>
+     */
+    @Scheduled(fixedDelay = 1000)
+    public void start() {
+        Long timeMillis = System.currentTimeMillis();
+        if (lastWatchTime == null
+            || !OPTIONING.isEmpty()
+            || timeMillis - lastOptionTime <= OPTION_INTERVAL.toMillis()
+            || timeMillis - lastWatchTime >= WATCHING_INTERVAL.toMillis()) {
+            lastWatchTime = timeMillis;
+            WATCHING_APPS.forEach(this::watch);
+        }
+    }
+
+    @VisibleForTesting
+    public @Nullable SparkAppStateEnum tryQuerySparkAppState(@Nonnull Long 
appId) {
+        SparkApplication app = WATCHING_APPS.get(appId);
+        return (app == null || app.getState() == null) ? null : 
app.getStateEnum();
+    }
+
+    private void watch(Long id, SparkApplication application) {
+        executorService.execute(
+            () -> {
+                try {
+                    getStateFromYarn(application);
+                } catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+            });
+    }
+
+    private StopFromEnum getAppStopFrom(Long appId) {
+        return STOP_FROM_MAP.getOrDefault(appId, StopFromEnum.NONE);
+    }
+
+    /**
+     * Query the job state from yarn and query the resource usage from spark 
when job state is RUNNING
+     *
+     * @param application spark application
+     */
+    private void getStateFromYarn(SparkApplication application) throws 
Exception {
+        SparkOptionStateEnum optionStateEnum = 
OPTIONING.get(application.getId());
+
+        // query the status from the yarn rest Api
+        YarnAppInfo yarnAppInfo = httpYarnAppInfo(application);
+        if (yarnAppInfo == null) {
+            throw new RuntimeException("[StreamPark][SparkAppHttpWatcher] 
getStateFromYarn failed!");
+        } else {
+            try {
+                String state = yarnAppInfo.getApp().getState();
+                SparkAppStateEnum sparkAppStateEnum = 
SparkAppStateEnum.of(state);
+                if (SparkAppStateEnum.OTHER == sparkAppStateEnum) {
+                    return;
+                }
+                if 
(SparkAppStateEnum.isEndState(sparkAppStateEnum.getValue())) {
+                    log.info(
+                        "[StreamPark][SparkAppHttpWatcher] getStateFromYarn, 
app {} was ended, jobId is {}, state is {}",
+                        application.getId(),
+                        application.getJobId(),
+                        sparkAppStateEnum);
+                    application.setEndTime(new Date());
+                }
+                if (SparkAppStateEnum.RUNNING == sparkAppStateEnum) {
+                    Tuple3<Double, Double, Long> resourceStatus = 
getResourceStatus(application);
+                    double memoryUsed = resourceStatus.t1;
+                    double maxMemory = resourceStatus.t2;
+                    double totalCores = resourceStatus.t3;
+                    log.info(
+                        "[StreamPark][SparkAppHttpWatcher] getStateFromYarn, 
app {} was running, jobId is {}, memoryUsed: {}MB, maxMemory: {}MB, totalCores: 
{}",
+                        application.getId(),
+                        application.getJobId(),
+                        String.format("%.2f", memoryUsed),
+                        String.format("%.2f", maxMemory),
+                        totalCores);
+                    // TODO: Modify the table structure to persist the results
+                }
+                application.setState(sparkAppStateEnum.getValue());
+                cleanOptioning(optionStateEnum, application.getId());
+                doPersistMetrics(application, false);
+                if (SparkAppStateEnum.FAILED == sparkAppStateEnum
+                    || SparkAppStateEnum.LOST == sparkAppStateEnum
+                    || applicationInfoService.checkAlter(application)) {
+                    doAlert(application, sparkAppStateEnum);
+                    if (SparkAppStateEnum.FAILED == sparkAppStateEnum) {
+                        applicationActionService.start(application, true);
+                    }
+                }
+            } catch (Exception e) {
+                throw new RuntimeException("[StreamPark][SparkAppHttpWatcher] 
getStateFromYarn failed!");
+            }
+        }
+    }
+
+    /**
+     * Calculate spark task progress from Spark rest api. (proxied by yarn) 
Only available when yarn
+     * application status is RUNNING.
+     *
+     * @param application
+     * @return task progress
+     * @throws Exception
+     */
+    private double getTasksProgress(SparkApplication application) throws 
Exception {
+        Job[] jobs = httpJobsStatus(application);

Review Comment:
   `getTasksProgress` is never used; consider removing it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [Feature] Support spark job status tracking [incubator-streampark]

Reply via email to