This is an automated email from the ASF dual-hosted git repository.
zhouky pushed a commit to branch branch-0.3
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/branch-0.3 by this push:
new 810317d6b [CELEBORN-975] Refactor the check logic to stop the celeborn
master and worker
810317d6b is described below
commit 810317d6be823aea1ebcaa2dea100e491e183511
Author: sychen <[email protected]>
AuthorDate: Mon Sep 18 16:23:32 2023 +0800
[CELEBORN-975] Refactor the check logic to stop the celeborn master and
worker
### What changes were proposed in this pull request?
The stop command used by `stop-master.sh` and `stop-worker.sh` now waits up
to 600s for the process to exit after sending `kill -15`.
Delete the pid file only when the stop succeeds, so that a retried stop
command can still find the pid file.
### Why are the changes needed?
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Closes #1911 from cxzl25/CELEBORN-975.
Authored-by: sychen <[email protected]>
Signed-off-by: zky.zhoukeyong <[email protected]>
(cherry picked from commit 07c1dc2568e8a1dc57448919ed01bfa6eb11b630)
Signed-off-by: zky.zhoukeyong <[email protected]>
---
sbin/celeborn-daemon.sh | 72 ++++++++++++++++++++++++-------------------------
1 file changed, 36 insertions(+), 36 deletions(-)
diff --git a/sbin/celeborn-daemon.sh b/sbin/celeborn-daemon.sh
index fce71bb27..cd69dc333 100755
--- a/sbin/celeborn-daemon.sh
+++ b/sbin/celeborn-daemon.sh
@@ -157,74 +157,74 @@ run_command() {
}
-case $option in
-
- (start)
+start_celeborn() {
run_command class "$@"
- ;;
-
- (stop)
-
- if [ -f $pid ]; then
- TARGET_ID="$(cat "$pid")"
- if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]] || [[ $(ps -p
"$TARGET_ID" -o comm=) =~ "jboot" ]]; then
- echo "stopping $command"
- kill "$TARGET_ID" && rm -f "$pid"
- else
- echo "no $command to stop"
- fi
- else
- echo "no $command to stop"
- fi
- ;;
-
- (restart)
+}
+stop_celeborn() {
if [ -f $pid ]; then
TARGET_ID="$(cat "$pid")"
if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]] || [[ $(ps -p
"$TARGET_ID" -o comm=) =~ "jboot" ]]; then
echo "stopping $command"
- kill "$TARGET_ID" && rm -f "$pid"
+ kill "$TARGET_ID"
wait_time=0
# keep same with `celeborn.worker.graceful.shutdown.timeout`
wait_timeout=600
while [[ $(ps -p "$TARGET_ID" -o comm=) != "" && $wait_time -lt
$wait_timeout ]];
do
- sleep 1s
+ sleep 1
((wait_time++))
echo "waiting for worker graceful shutdown, wait for ${wait_time}s"
done
+
if [[ $(ps -p "$TARGET_ID" -o comm=) == "" ]]; then
- run_command class "$@"
+ rm -f "$pid"
else
- echo "stopping $command failed."
+ echo "Failed to stop server(pid=$TARGET_ID) after ${wait_timeout}s"
+ exit 1
fi
else
- rm -f "$pid"
- echo "no $command to stop, directly start"
- run_command class "$@"
+ echo "no $command to stop"
fi
else
- echo "no $command to stop, directly start"
- run_command class "$@"
+ echo "no $command to stop"
fi
- ;;
-
- (status)
+}
+check_celeborn(){
if [ -f $pid ]; then
TARGET_ID="$(cat "$pid")"
if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]] || [[ $(ps -p
"$TARGET_ID" -o comm=) =~ "jboot" ]]; then
- echo $command is running.
+ echo "$command is running."
exit 0
else
- echo $pid file is present but $command not running
+ echo "$pid file is present but $command not running"
exit 1
fi
else
- echo $command not running.
+ echo "$command not running."
exit 2
fi
+}
+
+case $option in
+
+ (start)
+ start_celeborn "$@"
+ ;;
+
+ (stop)
+ stop_celeborn
+ ;;
+
+ (restart)
+ echo "Restarting Celeborn"
+ stop_celeborn
+ start_celeborn "$@"
+ ;;
+
+ (status)
+ check_celeborn
;;
(*)