The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after ark-5.14 ------> commit 5216fd4e1597fe5990502fe8d717210e3aebf363 Author: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> Date: Thu Sep 30 17:43:57 2021 +0300
fence-watchdog: Add fence-watchdog driver

We need to forbid system to work without a special userspace daemon for
purposes of HA cluster. So add this watchdog module, which will fence the
node, if that daemon won't update timer value in the file
/sys/kernel/watchdog_timer.

The module is needed for pstorage, so we need to protect network from
the broken node, so we can put check to net_rx_action.

Signed-off-by: Dmitry Guryanov <dgurya...@parallels.com>
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>

Rebase: ktkhai@

Putting fence_wdog_jiffies64 in same cacheline with jiffies will be in
a separate patch: "fence-watchdog: link fence_wdog_jiffies64 and jiffies
in one cacheline"

Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>

(cherry-picked from vz8 commit aef6d38b398b ("fence-watchdog: Add
fence-watchdog driver"))

Updated use of timekeeping API since 32-bit timespec is no longer
available.

Applied minor formatting fixes.

Added "CONFIG_FENCE_WATCHDOG=y" to
redhat/configs/custom-overrides/generic/CONFIG_FENCE_WATCHDOG

Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com>
---
 include/linux/fence-watchdog.h                     |  16 +
 kernel/Kconfig.openvz                              |   4 +
 kernel/Makefile                                    |   1 +
 kernel/fence-watchdog.c                            | 335 +++++++++++++++++++++
 net/core/dev.c                                     |  14 +
 .../custom-overrides/generic/CONFIG_FENCE_WATCHDOG |   1 +
 6 files changed, 371 insertions(+)

diff --git a/include/linux/fence-watchdog.h b/include/linux/fence-watchdog.h
new file mode 100644
index 000000000000..26b542a4080f
--- /dev/null
+++ b/include/linux/fence-watchdog.h
@@ -0,0 +1,16 @@
+/*
+ * include/linux/fence-watchdog.h
+ *
+ * Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ * Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef _LINUX_FENCE_WATCHDOG_H_
+#define _LINUX_FENCE_WATCHDOG_H_
+
+/* Defined out of line in kernel/fence-watchdog.c. */
+int fence_wdog_check_timer(void);
+bool fence_wdog_tmo_match(void);
+
+#endif
diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz
index 6c3fbed8ae60..9489342596ab 100644
--- a/kernel/Kconfig.openvz
+++ b/kernel/Kconfig.openvz
@@ -60,4 +60,8 @@ config VZ_EVENT
 	  networking code does. By now just the notifications of
 	  the VE essensial status changes are being sent.
 
+config FENCE_WATCHDOG
+	bool "Fencing watchdog for HA cluster support"
+	depends on X86_64
+	default n
 endmenu
diff --git a/kernel/Makefile b/kernel/Makefile
index bf938a777629..6f59a21caa5b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -126,6 +126,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_FENCE_WATCHDOG) += fence-watchdog.o
 
 obj-$(CONFIG_HAS_IOMEM) += iomem.o
 obj-$(CONFIG_RSEQ) += rseq.o
diff --git a/kernel/fence-watchdog.c b/kernel/fence-watchdog.c
new file mode 100644
index 000000000000..e7fe7d2f3804
--- /dev/null
+++ b/kernel/fence-watchdog.c
@@ -0,0 +1,335 @@
+/*
+ * kernel/fence-watchdog.c
+ *
+ * Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ * Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+/*
+ * Provide userspace with an interface to forbid kernel to work
+ * without an userspace daemon.
+ *
+ * The daemon should write number of seconds before fencing to the
+ * file /sys/kernel/watchdog_timer, and must renew it, until the
+ * time elapses.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/jiffies.h>
+#include <linux/reboot.h>
+#include <linux/fence-watchdog.h>
+#include <linux/device.h>
+#include <linux/kmsg_dump.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+
+#define MAX_U64			(~(u64)0)
+#define MAX_JIFFIES_DELTA	(10 * 365UL * 24UL * 3600UL * HZ)
+#define ACTION_NAME_LEN		16
+
+enum {
+	FENCE_WDOG_CRASH = 0,
+	FENCE_WDOG_REBOOT = 1,
+	FENCE_WDOG_POWEROFF = 2,
+	FENCE_WDOG_NETFILTER = 3,
+};
+
+const char *action_names[] = {"crash", "reboot", "halt", "netfilter", NULL};
+
+unsigned long volatile fence_wdog_jiffies64 = MAX_U64;
+static int fence_wdog_action = FENCE_WDOG_CRASH;
+
+enum {
+	NOT_FENCED = 0,
+	FENCED = 1,
+	FENCED_TIMEOUT = 2,
+};
+
+static atomic_t fence_stage = ATOMIC_INIT(NOT_FENCED);
+static char fence_wdog_log_path[PATH_MAX] = "/fence_wdog.log";
+
+#define SECS_PER_MIN	60
+#define PREFIX_LEN	39
+
+/* Write the "[HH:MM:SS/YYYY-MM-DD] fence-watchdog: " prefix into msg. */
+static int print_prefix(char *msg) {
+	struct timespec64 ts;
+	struct tm tm;
+
+	ktime_get_real_ts64(&ts);
+	time64_to_tm(ts.tv_sec - sys_tz.tz_minuteswest * SECS_PER_MIN, 0, &tm);
+
+	return snprintf(msg, PREFIX_LEN, "[%02d:%02d:%02d/%04ld-%02d-%02d] fence-watchdog: ",
+		tm.tm_hour, tm.tm_min, tm.tm_sec,
+		tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday);
+}
+
+#define MSG_LEN (PREFIX_LEN + 10)
+
+/* Append the timestamped action name to fence_wdog_log_path and fsync it. */
+void fence_wdog_log(void)
+{
+	char msg[MSG_LEN];
+	struct file *file;
+	int ret, len;
+
+	ret = print_prefix(msg);
+	if (ret < 0)
+		return;
+
+	len = strlen(msg);
+
+	ret = snprintf(msg + len, MSG_LEN - len, "%s\n", action_names[fence_wdog_action]);
+	if (ret != strlen(action_names[fence_wdog_action]) + 1) {
+		printk(KERN_EMERG "fence-watchdog: Failed to sprintf msg\n");
+		return;
+	}
+
+	file = filp_open(fence_wdog_log_path,
+			 O_CREAT | O_WRONLY | O_APPEND | O_NOFOLLOW | O_LARGEFILE,
+			 0600);
+	if (IS_ERR(file)) {
+		printk(KERN_EMERG "fence-watchdog: Failed to open log path\n");
+		return;
+	}
+
+	if (!S_ISREG(file_inode(file)->i_mode)) {
+		printk(KERN_EMERG "fence-watchdog: Wrong type of log file\n");
+		goto close;
+	}
+
+	ret = kernel_write(file, msg, strlen(msg), &file->f_pos);
+	if (ret < 0) {
+		printk(KERN_EMERG "fence-watchdog: Failed to write msg, ret=%d\n", ret);
+		goto close;
+	}
+
+	ret = vfs_fsync(file, 0);
+	if (ret < 0)
+		printk(KERN_EMERG "fence-watchdog: Failed to fsync log file ret=%d\n", ret);
+
+close:
+	ret = filp_close(file, NULL);
+	if (ret < 0)
+		printk(KERN_EMERG "fence-watchdog: Failed to close log file ret=%d\n", ret);
+
+	return;
+}
+
+/* Work handler: log the action, then restart or halt as configured. */
+static void do_halt_or_reboot(struct work_struct *dummy)
+{
+	printk(KERN_EMERG "fence-watchdog: %s\n",
+	       action_names[fence_wdog_action]);
+
+	fence_wdog_log();
+
+	switch (fence_wdog_action) {
+	case FENCE_WDOG_REBOOT:
+		emergency_restart();
+		break;
+	case FENCE_WDOG_POWEROFF:
+		kernel_halt();
+		break;
+	}
+}
+
+static DECLARE_WORK(halt_or_reboot_work, do_halt_or_reboot);
+
+/* Panic for "crash" or after escalation; otherwise queue the work item. */
+void fence_wdog_do_fence(void)
+{
+	if (fence_wdog_action == FENCE_WDOG_CRASH ||
+			atomic_read(&fence_stage) == FENCED_TIMEOUT)
+		panic("fence-watchdog: %s\n",
+		      action_names[fence_wdog_action]);
+	else
+		schedule_work(&halt_or_reboot_work);
+}
+
+#define FENCE_WDOG_TIMEOUT 30
+
+/*
+ * Returns 1 when the deadline has passed and the action is not
+ * "netfilter", 0 otherwise. Fencing starts on the first expired call and
+ * escalates to panic FENCE_WDOG_TIMEOUT seconds after the deadline.
+ */
+int fence_wdog_check_timer(void)
+{
+	if (unlikely(get_jiffies_64() > fence_wdog_jiffies64 &&
+			fence_wdog_action != FENCE_WDOG_NETFILTER)) {
+		if (atomic_cmpxchg(&fence_stage, NOT_FENCED, FENCED) == NOT_FENCED
+		    || (get_jiffies_64() > fence_wdog_jiffies64
+		    + FENCE_WDOG_TIMEOUT * HZ
+		    && atomic_cmpxchg(&fence_stage, FENCED, FENCED_TIMEOUT) == FENCED))
+			fence_wdog_do_fence();
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/* True once the deadline has passed, whatever the configured action. */
+bool fence_wdog_tmo_match(void)
+{
+	return get_jiffies_64() > fence_wdog_jiffies64;
+}
+EXPORT_SYMBOL(fence_wdog_tmo_match);
+
+/* Show seconds left before fencing, or "inf" beyond MAX_JIFFIES_DELTA. */
+static ssize_t fence_wdog_timer_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	ssize_t ret;
+	u64 jiffies_delta = fence_wdog_jiffies64 - get_jiffies_64();
+	struct timespec64 t;
+
+	if (jiffies_delta > MAX_JIFFIES_DELTA) {
+		ret = sprintf(buf, "inf\n");
+	} else {
+		jiffies_to_timespec64(jiffies_delta, &t);
+		ret = sprintf(buf, "%lld\n", t.tv_sec);
+	}
+
+	return ret;
+}
+
+/* Set the deadline to now + N seconds; writing 0 disarms the watchdog. */
+static ssize_t fence_wdog_timer_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long long val;
+	unsigned long jiffies_delta;
+	struct timespec64 t;
+
+	if (kstrtoull(buf, 10, &val))
+		return -EINVAL;
+
+	if (val == 0) {
+		fence_wdog_jiffies64 = MAX_U64;
+		return count;
+	}
+
+	t.tv_sec = val;
+	t.tv_nsec = 0;
+
+	jiffies_delta = timespec64_to_jiffies(&t);
+	if (jiffies_delta > MAX_JIFFIES_DELTA)
+		return -EINVAL;
+
+	fence_wdog_jiffies64 = get_jiffies_64() + jiffies_delta;
+
+	return count;
+}
+
+/* Show the name of the configured fencing action. */
+static ssize_t fence_wdog_action_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", action_names[fence_wdog_action]);
+}
+
+/* Select the fencing action by (case-insensitive) name. */
+static ssize_t fence_wdog_action_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char str_action[ACTION_NAME_LEN];
+	int i = 0;
+
+	if (sscanf(buf, "%15s", str_action) != 1)
+		return -EINVAL;
+
+	for (i = 0; action_names[i]; i++) {
+		if ((!strncasecmp(str_action, action_names[i], ACTION_NAME_LEN))) {
+			fence_wdog_action = i;
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/* List every supported action name, space-separated. */
+static ssize_t fence_wdog_available_actions_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	int i, ret = 0;
+
+	for (i = 0; action_names[i] != NULL; i++)
+		ret += sprintf(&buf[ret], "%s ", action_names[i]);
+
+	ret += sprintf(&buf[ret], "\n");
+	return ret;
+}
+
+/* Show the path of the fencing log file. */
+static ssize_t fence_wdog_log_path_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", fence_wdog_log_path);
+}
+
+#define STORE_FORMAT_LEN 16
+
+/* Set the fencing log path from the first whitespace-delimited token. */
+static ssize_t fence_wdog_log_path_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char format[STORE_FORMAT_LEN];
+	int ret;
+
+	ret = snprintf(format, STORE_FORMAT_LEN, "%%%ds", PATH_MAX - 1);
+	if (ret < 0)
+		return ret;
+
+	if (sscanf(buf, format, fence_wdog_log_path) != 1)
+		return -EINVAL;
+
+	/*
+	 * A sysfs store must return the number of bytes it consumed;
+	 * returning 0 can make userspace writers retry the write forever.
+	 */
+	return count;
+}
+
+static struct kobj_attribute fence_wdog_timer_attr =
+	__ATTR(watchdog_timer, 0644,
+		fence_wdog_timer_show, fence_wdog_timer_store);
+
+static struct kobj_attribute fence_wdog_action_attr =
+	__ATTR(watchdog_action, 0644,
+		fence_wdog_action_show, fence_wdog_action_store);
+
+/* Read-only: this attribute has no store handler. */
+static struct kobj_attribute fence_wdog_available_actions_attr =
+	__ATTR(watchdog_available_actions, 0444,
+		fence_wdog_available_actions_show, NULL);
+
+static struct kobj_attribute fence_wdog_log_path_attr =
+	__ATTR(watchdog_log_path, 0644,
+		fence_wdog_log_path_show, fence_wdog_log_path_store);
+
+static struct attribute *fence_wdog_attrs[] = {
+	&fence_wdog_timer_attr.attr,
+	&fence_wdog_action_attr.attr,
+	&fence_wdog_available_actions_attr.attr,
+	&fence_wdog_log_path_attr.attr,
+	NULL,
+};
+
+static struct attribute_group fence_wdog_attr_group = {
+	.attrs = fence_wdog_attrs,
+};
+
+static int __init fence_wdog_init(void)
+{
+	/* Propagate a failure to add the attributes instead of hiding it. */
+	return sysfs_update_group(kernel_kobj, &fence_wdog_attr_group);
+}
+
+module_init(fence_wdog_init)
diff --git a/net/core/dev.c b/net/core/dev.c
index 3500c9544d27..21b0e5ff5eaf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -151,6 +151,7 @@
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <linux/ve.h>
+#include <linux/fence-watchdog.h>
 
 #include "net-sysfs.h"
 
@@ -3669,6 +3670,15 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 	struct sk_buff *skb = first;
 	int rc = NETDEV_TX_OK;
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	if (unlikely(fence_wdog_check_timer())) {
+		/* first may head a list of skbs: free them all */
+		kfree_skb_list(skb);
+		*ret = rc;
+		return NULL;
+	}
+#endif
+
 	while (skb) {
 		struct sk_buff *next = skb->next;
 
@@ -7189,6 +7199,10 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	list_splice_init(&sd->poll_list, &list);
 	local_irq_enable();
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	fence_wdog_check_timer();
+#endif
+
 	for (;;) {
 		struct napi_struct *n;
 
diff --git a/redhat/configs/custom-overrides/generic/CONFIG_FENCE_WATCHDOG b/redhat/configs/custom-overrides/generic/CONFIG_FENCE_WATCHDOG
new file mode 100644
index 000000000000..434aac2b336a
--- /dev/null
+++ b/redhat/configs/custom-overrides/generic/CONFIG_FENCE_WATCHDOG
@@ -0,0 +1 @@
+CONFIG_FENCE_WATCHDOG=y
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel