ChengJie1053 commented on code in PR #3843: URL: https://github.com/apache/incubator-streampark/pull/3843#discussion_r1683854458
########## streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/watcher/SparkAppHttpWatcher.java: ########## @@ -0,0 +1,389 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.streampark.console.core.watcher; + +import org.apache.streampark.common.util.YarnUtils; +import org.apache.streampark.console.base.util.JacksonUtils; +import org.apache.streampark.console.base.util.Tuple2; +import org.apache.streampark.console.base.util.Tuple3; +import org.apache.streampark.console.core.bean.AlertTemplate; +import org.apache.streampark.console.core.entity.SparkApplication; +import org.apache.streampark.console.core.enums.SparkAppStateEnum; +import org.apache.streampark.console.core.enums.SparkOptionStateEnum; +import org.apache.streampark.console.core.enums.StopFromEnum; +import org.apache.streampark.console.core.metrics.spark.Job; +import org.apache.streampark.console.core.metrics.spark.SparkExecutor; +import org.apache.streampark.console.core.metrics.yarn.YarnAppInfo; +import org.apache.streampark.console.core.service.alert.AlertService; +import org.apache.streampark.console.core.service.application.SparkApplicationActionService; +import org.apache.streampark.console.core.service.application.SparkApplicationInfoService; +import org.apache.streampark.console.core.service.application.SparkApplicationManageService; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.hc.core5.util.Timeout; + +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Component; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import javax.annotation.PostConstruct; +import javax.annotation.PreDestroy; + +import java.io.IOException; +import java.time.Duration; +import java.util.Arrays; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; + +@Slf4j +@Component +public class SparkAppHttpWatcher { + + @Autowired + private SparkApplicationManageService applicationManageService; + + @Autowired + private SparkApplicationActionService applicationActionService; + + @Autowired + private SparkApplicationInfoService applicationInfoService; + + @Autowired + private AlertService alertService; + + @Qualifier("sparkRestAPIWatchingExecutor") + @Autowired + private Executor executorService; + + // track interval every 5 seconds + public static final Duration WATCHING_INTERVAL = Duration.ofSeconds(5); + + // option interval within 10 seconds + private static final Duration OPTION_INTERVAL = Duration.ofSeconds(10); + + private static final Timeout HTTP_TIMEOUT = Timeout.ofSeconds(5); + + /** + * Record the status of the first tracking task, because after the task is started, the overview + * of the task will be obtained during the first tracking + */ + private static final Cache<Long, Byte> STARTING_CACHE = + Caffeine.newBuilder().expireAfterWrite(5, TimeUnit.MINUTES).build(); + + /** tracking task list */ + private static final Map<Long, SparkApplication> WATCHING_APPS = new ConcurrentHashMap<>(0); + + /** + * <pre> + * StopFrom: Recording spark application stopped by streampark or stopped by other actions + * </pre> + */ + private static final Map<Long, StopFromEnum> STOP_FROM_MAP = new ConcurrentHashMap<>(0); + + /** + * Task canceled tracking list, record who cancelled the tracking task Map<applicationId,userId> + */ + private static final Map<Long, Long> CANCELLED_JOB_MAP = new ConcurrentHashMap<>(0); + + private static final Map<Long, SparkOptionStateEnum> OPTIONING = new ConcurrentHashMap<>(0); + + private Long lastWatchTime = 0L; + + private Long lastOptionTime = 0L; + + private static final Byte DEFAULT_FLAG_BYTE = Byte.valueOf("0"); + + @PostConstruct + public void init() { + WATCHING_APPS.clear(); + List<SparkApplication> applications = + applicationManageService.list( + new LambdaQueryWrapper<SparkApplication>() + .eq(SparkApplication::getTracking, 1) + .ne(SparkApplication::getState, SparkAppStateEnum.LOST.getValue())); + applications.forEach( + (app) -> { + WATCHING_APPS.put(app.getId(), app); + STARTING_CACHE.put(app.getId(), DEFAULT_FLAG_BYTE); + }); + } + + @PreDestroy + public void doStop() { + log.info( + "[StreamPark][SparkAppHttpWatcher] StreamPark Console will be shutdown, persistent application to database."); + WATCHING_APPS.forEach((k, v) -> applicationManageService.persistMetrics(v)); + } + + /** + * <strong>NOTE: The following conditions must be met for execution</strong> + * + * <p><strong>1) Program started or page operated task, such as start/stop, needs to return the + * state immediately. (the frequency of 1 second once, continued 10 seconds (10 times))</strong> + * + * <p><strong>2) Normal information obtain, once every 5 seconds</strong> + */ + @Scheduled(fixedDelay = 1000) + public void start() { + Long timeMillis = System.currentTimeMillis(); + if (lastWatchTime == null + || !OPTIONING.isEmpty() + || timeMillis - lastOptionTime <= OPTION_INTERVAL.toMillis() + || timeMillis - lastWatchTime >= WATCHING_INTERVAL.toMillis()) { + lastWatchTime = timeMillis; + WATCHING_APPS.forEach(this::watch); + } + } + + @VisibleForTesting + public @Nullable SparkAppStateEnum tryQuerySparkAppState(@Nonnull Long appId) { + SparkApplication app = WATCHING_APPS.get(appId); + return (app == null || app.getState() == null) ? null : app.getStateEnum(); + } + + private void watch(Long id, SparkApplication application) { + executorService.execute( + () -> { + try { + getStateFromYarn(application); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + private StopFromEnum getAppStopFrom(Long appId) { + return STOP_FROM_MAP.getOrDefault(appId, StopFromEnum.NONE); Review Comment: `getAppStopFrom` is not used ########## streampark-console/streampark-console-service/src/main/java/org/apache/streampark/console/core/watcher/SparkAppHttpWatcher.java: ########## @@ -0,0 +1,389 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.streampark.console.core.watcher; + +import org.apache.streampark.common.util.YarnUtils; +import org.apache.streampark.console.base.util.JacksonUtils; +import org.apache.streampark.console.base.util.Tuple2; +import org.apache.streampark.console.base.util.Tuple3; +import org.apache.streampark.console.core.bean.AlertTemplate; +import org.apache.streampark.console.core.entity.SparkApplication; +import org.apache.streampark.console.core.enums.SparkAppStateEnum; +import org.apache.streampark.console.core.enums.SparkOptionStateEnum; +import org.apache.streampark.console.core.enums.StopFromEnum; +import org.apache.streampark.console.core.metrics.spark.Job; +import org.apache.streampark.console.core.metrics.spark.SparkExecutor; +import org.apache.streampark.console.core.metrics.yarn.YarnAppInfo; +import org.apache.streampark.console.core.service.alert.AlertService; +import org.apache.streampark.console.core.service.application.SparkApplicationActionService; +import org.apache.streampark.console.core.service.application.SparkApplicationInfoService; +import org.apache.streampark.console.core.service.application.SparkApplicationManageService; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.hc.core5.util.Timeout; + +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Component; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import javax.annotation.PostConstruct; +import javax.annotation.PreDestroy; + +import java.io.IOException; +import java.time.Duration; +import java.util.Arrays; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; + +@Slf4j +@Component +public class SparkAppHttpWatcher { + + @Autowired + private SparkApplicationManageService applicationManageService; + + @Autowired + private SparkApplicationActionService applicationActionService; + + @Autowired + private SparkApplicationInfoService applicationInfoService; + + @Autowired + private AlertService alertService; + + @Qualifier("sparkRestAPIWatchingExecutor") + @Autowired + private Executor executorService; + + // track interval every 5 seconds + public static final Duration WATCHING_INTERVAL = Duration.ofSeconds(5); + + // option interval within 10 seconds + private static final Duration OPTION_INTERVAL = Duration.ofSeconds(10); + + private static final Timeout HTTP_TIMEOUT = Timeout.ofSeconds(5); + + /** + * Record the status of the first tracking task, because after the task is started, the overview + * of the task will be obtained during the first tracking + */ + private static final Cache<Long, Byte> STARTING_CACHE = + Caffeine.newBuilder().expireAfterWrite(5, TimeUnit.MINUTES).build(); + + /** tracking task list */ + private static final Map<Long, SparkApplication> WATCHING_APPS = new ConcurrentHashMap<>(0); + + /** + * <pre> + * StopFrom: Recording spark application stopped by streampark or stopped by other actions + * </pre> + */ + private static final Map<Long, StopFromEnum> STOP_FROM_MAP = new ConcurrentHashMap<>(0); + + /** + * Task canceled tracking list, record who cancelled the tracking task Map<applicationId,userId> + */ + private static final Map<Long, Long> CANCELLED_JOB_MAP = new ConcurrentHashMap<>(0); + + private static final Map<Long, SparkOptionStateEnum> OPTIONING = new ConcurrentHashMap<>(0); + + private Long lastWatchTime = 0L; + + private Long lastOptionTime = 0L; + + private static final Byte DEFAULT_FLAG_BYTE = Byte.valueOf("0"); + + @PostConstruct + public void init() { + WATCHING_APPS.clear(); + List<SparkApplication> applications = + applicationManageService.list( + new LambdaQueryWrapper<SparkApplication>() + .eq(SparkApplication::getTracking, 1) + .ne(SparkApplication::getState, SparkAppStateEnum.LOST.getValue())); + applications.forEach( + (app) -> { + WATCHING_APPS.put(app.getId(), app); + STARTING_CACHE.put(app.getId(), DEFAULT_FLAG_BYTE); + }); + } + + @PreDestroy + public void doStop() { + log.info( + "[StreamPark][SparkAppHttpWatcher] StreamPark Console will be shutdown, persistent application to database."); + WATCHING_APPS.forEach((k, v) -> applicationManageService.persistMetrics(v)); + } + + /** + * <strong>NOTE: The following conditions must be met for execution</strong> + * + * <p><strong>1) Program started or page operated task, such as start/stop, needs to return the + * state immediately. (the frequency of 1 second once, continued 10 seconds (10 times))</strong> + * + * <p><strong>2) Normal information obtain, once every 5 seconds</strong> + */ + @Scheduled(fixedDelay = 1000) + public void start() { + Long timeMillis = System.currentTimeMillis(); + if (lastWatchTime == null + || !OPTIONING.isEmpty() + || timeMillis - lastOptionTime <= OPTION_INTERVAL.toMillis() + || timeMillis - lastWatchTime >= WATCHING_INTERVAL.toMillis()) { + lastWatchTime = timeMillis; + WATCHING_APPS.forEach(this::watch); + } + } + + @VisibleForTesting + public @Nullable SparkAppStateEnum tryQuerySparkAppState(@Nonnull Long appId) { + SparkApplication app = WATCHING_APPS.get(appId); + return (app == null || app.getState() == null) ? null : app.getStateEnum(); + } + + private void watch(Long id, SparkApplication application) { + executorService.execute( + () -> { + try { + getStateFromYarn(application); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + private StopFromEnum getAppStopFrom(Long appId) { + return STOP_FROM_MAP.getOrDefault(appId, StopFromEnum.NONE); + } + + /** + * Query the job state from yarn and query the resource usage from spark when job state is RUNNING + * + * @param application spark application + */ + private void getStateFromYarn(SparkApplication application) throws Exception { + SparkOptionStateEnum optionStateEnum = OPTIONING.get(application.getId()); + + // query the status from the yarn rest Api + YarnAppInfo yarnAppInfo = httpYarnAppInfo(application); + if (yarnAppInfo == null) { + throw new RuntimeException("[StreamPark][SparkAppHttpWatcher] getStateFromYarn failed!"); + } else { + try { + String state = yarnAppInfo.getApp().getState(); + SparkAppStateEnum sparkAppStateEnum = SparkAppStateEnum.of(state); + if (SparkAppStateEnum.OTHER == sparkAppStateEnum) { + return; + } + if (SparkAppStateEnum.isEndState(sparkAppStateEnum.getValue())) { + log.info( + "[StreamPark][SparkAppHttpWatcher] getStateFromYarn, app {} was ended, jobId is {}, state is {}", + application.getId(), + application.getJobId(), + sparkAppStateEnum); + application.setEndTime(new Date()); + } + if (SparkAppStateEnum.RUNNING == sparkAppStateEnum) { + Tuple3<Double, Double, Long> resourceStatus = getResourceStatus(application); + double memoryUsed = resourceStatus.t1; + double maxMemory = resourceStatus.t2; + double totalCores = resourceStatus.t3; + log.info( + "[StreamPark][SparkAppHttpWatcher] getStateFromYarn, app {} was running, jobId is {}, memoryUsed: {}MB, maxMemory: {}MB, totalCores: {}", + application.getId(), + application.getJobId(), + String.format("%.2f", memoryUsed), + String.format("%.2f", maxMemory), + totalCores); + // TODO: Modify the table structure to persist the results + } + application.setState(sparkAppStateEnum.getValue()); + cleanOptioning(optionStateEnum, application.getId()); + doPersistMetrics(application, false); + if (SparkAppStateEnum.FAILED == sparkAppStateEnum + || SparkAppStateEnum.LOST == sparkAppStateEnum + || applicationInfoService.checkAlter(application)) { + doAlert(application, sparkAppStateEnum); + if (SparkAppStateEnum.FAILED == sparkAppStateEnum) { + applicationActionService.start(application, true); + } + } + } catch (Exception e) { + throw new RuntimeException("[StreamPark][SparkAppHttpWatcher] getStateFromYarn failed!"); + } + } + } + + /** + * Calculate spark task progress from Spark rest api. (proxyed by yarn) Only available when yarn + * application status is RUNNING. + * + * @param application + * @return task progress + * @throws Exception + */ + private double getTasksProgress(SparkApplication application) throws Exception { + Job[] jobs = httpJobsStatus(application); Review Comment: `getTasksProgress` is not used -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
