[PATCH] net/mlx5: fix kfree mismatch in indir_table.c
Memory allocated by kvzalloc() should be freed by kvfree(). Fixes: 34ca65352ddf2 ("net/mlx5: E-Switch, Indirect table infrastructur") Signed-off-by: Xiaoming Ni --- .../net/ethernet/mellanox/mlx5/core/esw/indir_table.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c index 6f6772bf61a2..3da7becc1069 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c @@ -248,7 +248,7 @@ static int mlx5_esw_indir_table_rule_get(struct mlx5_eswitch *esw, err_ethertype: kfree(rule); out: - kfree(rule_spec); + kvfree(rule_spec); return err; } @@ -328,7 +328,7 @@ static int mlx5_create_indir_recirc_group(struct mlx5_eswitch *esw, e->recirc_cnt = 0; out: - kfree(in); + kvfree(in); return err; } @@ -347,7 +347,7 @@ static int mlx5_create_indir_fwd_group(struct mlx5_eswitch *esw, spec = kvzalloc(sizeof(*spec), GFP_KERNEL); if (!spec) { - kfree(in); + kvfree(in); return -ENOMEM; } @@ -371,8 +371,8 @@ static int mlx5_create_indir_fwd_group(struct mlx5_eswitch *esw, } err_out: - kfree(spec); - kfree(in); + kvfree(spec); + kvfree(in); return err; } -- 2.27.0
[PATCH resend 3/4] nfc: fix memory leak in llcp_sock_connect()
In llcp_sock_connect(), use kmemdup to allocate memory for "llcp_sock->service_name". The memory is not released in the sock_unlink label of the subsequent failure branch. As a result, memory leakage occurs. fix CVE-2020-25672 Fixes: d646960f7986 ("NFC: Initial LLCP support") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.3 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9e2799ee1595..59172614b249 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -746,6 +746,8 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, sock_unlink: nfc_llcp_sock_unlink(>connecting_sockets, sk); + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); -- 2.27.0
[PATCH resend 0/4] nfc: fix Resource leakage and endless loop
fix Resource leakage and endless loop in net/nfc/llcp_sock.c, reported by "kiyin(尹亮)". Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Xiaoming Ni (4): nfc: fix refcount leak in llcp_sock_bind() nfc: fix refcount leak in llcp_sock_connect() nfc: fix memory leak in llcp_sock_connect() nfc: Avoid endless loops caused by repeated llcp_sock_connect() net/nfc/llcp_sock.c | 10 ++ 1 file changed, 10 insertions(+) -- 2.27.0
[PATCH resend 1/4] nfc: fix refcount leak in llcp_sock_bind()
nfc_llcp_local_get() is invoked in llcp_sock_bind(), but nfc_llcp_local_put() is not invoked in subsequent failure branches. As a result, refcount leakage occurs. To fix it, add calling nfc_llcp_local_put(). fix CVE-2020-25670 Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.6 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index d257ed3b732a..68832ee4b9f8 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -108,11 +108,13 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { + nfc_llcp_local_put(llcp_sock->local); ret = -ENOMEM; goto put_dev; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { + nfc_llcp_local_put(llcp_sock->local); kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; ret = -EADDRINUSE; -- 2.27.0
[PATCH resend 2/4] nfc: fix refcount leak in llcp_sock_connect()
nfc_llcp_local_get() is invoked in llcp_sock_connect(), but nfc_llcp_local_put() is not invoked in subsequent failure branches. As a result, refcount leakage occurs. To fix it, add calling nfc_llcp_local_put(). fix CVE-2020-25671 Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.6 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 68832ee4b9f8..9e2799ee1595 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -704,6 +704,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->local = nfc_llcp_local_get(local); llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { + nfc_llcp_local_put(llcp_sock->local); ret = -ENOMEM; goto put_dev; } @@ -748,6 +749,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); + nfc_llcp_local_put(llcp_sock->local); put_dev: nfc_put_device(dev); -- 2.27.0
[PATCH resend 4/4] nfc: Avoid endless loops caused by repeated llcp_sock_connect()
When sock_wait_state() returns -EINPROGRESS, "sk->sk_state" is LLCP_CONNECTING. In this case, llcp_sock_connect() is repeatedly invoked, nfc_llcp_sock_link() will add sk to local->connecting_sockets twice. sk->sk_node->next will point to itself, that will make an endless loop and hang-up the system. To fix it, check whether sk->sk_state is LLCP_CONNECTING in llcp_sock_connect() to avoid repeated invoking. Fixes: b4011239a08e ("NFC: llcp: Fix non blocking sockets connections") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.11 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 4 1 file changed, 4 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 59172614b249..a3b46f03 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -673,6 +673,10 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, ret = -EISCONN; goto error; } + if (sk->sk_state == LLCP_CONNECTING) { + ret = -EINPROGRESS; + goto error; + } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { -- 2.27.0
Re: [PATCH 4/4] nfc: Avoid endless loops caused by repeated llcp_sock_connect()(Internet mail)
On 2021/3/3 17:28, kiyin(尹亮) wrote: Hi xiaoming, the path can only fix the endless loop problem. it can't fix the meaningless llcp_sock->service_name problem. if we set llcp_sock->service_name to meaningless string, the connect will be failed. and sk->sk_state will not be LLCP_CONNECTED. then we can call llcp_sock_connect() many times. that leaks everything: llcp_sock->dev, llcp_sock->local, llcp_sock->ssap, llcp_sock->service_name... I didn't find the code to modify sk->sk_state after a connect failure. Can you provide guidance? Based on my understanding of the current code: After llcp_sock_connect() is invoked using the meaningless service_name as the parameter, sk->sk_state is set to LLCP_CONNECTING. After that, no corresponding service responds to the request because the service_name is meaningless, the value of sk->sk_state remains unchanged. Therefore, when llcp_sock_connect() is invoked again, resources such as llcp_sock->service_name are not repeatedly applied because sk_state is set to LLCP_CONNECTING. In this way, the repeated invoking of llcp_sock_connect() does not repeatedly leak resources. Thanks Xiaoming Ni -Original Message- From: Xiaoming Ni [mailto:nixiaom...@huawei.com] Sent: Wednesday, March 3, 2021 2:17 PM To: linux-kernel@vger.kernel.org; kiyin(尹亮) ; sta...@vger.kernel.org; gre...@linuxfoundation.org; sa...@linux.intel.com; linvi...@tuxdriver.com; da...@davemloft.net; k...@kernel.org; m...@pengutronix.de; ste...@datenfreihafen.org; matthieu.bae...@tessares.net; net...@vger.kernel.org Cc: nixiaom...@huawei.com; wang...@huawei.com; xiaoqi...@huawei.com Subject: [PATCH 4/4] nfc: Avoid endless loops caused by repeated llcp_sock_connect()(Internet mail) When sock_wait_state() returns -EINPROGRESS, "sk->sk_state" is LLCP_CONNECTING. In this case, llcp_sock_connect() is repeatedly invoked, nfc_llcp_sock_link() will add sk to local->connecting_sockets twice. sk->sk_node->next will point to itself, that will make an endless loop and hang-up the system. To fix it, check whether sk->sk_state is LLCP_CONNECTING in llcp_sock_connect() to avoid repeated invoking. fix CVE-2020-25673 Fixes: b4011239a08e ("NFC: llcp: Fix non blocking sockets connections") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.11 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 4 1 file changed, 4 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 59172614b249..a3b46f03 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -673,6 +673,10 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, ret = -EISCONN; goto error; } + if (sk->sk_state == LLCP_CONNECTING) { + ret = -EINPROGRESS; + goto error; + } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { -- 2.27.0
[PATCH 1/4] nfc: fix refcount leak in llcp_sock_bind()
nfc_llcp_local_get() is invoked in llcp_sock_bind(), but nfc_llcp_local_put() is not invoked in subsequent failure branches. As a result, refcount leakage occurs. To fix it, add calling nfc_llcp_local_put(). fix CVE-2020-25670 Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.6 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index d257ed3b732a..68832ee4b9f8 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -108,11 +108,13 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { + nfc_llcp_local_put(llcp_sock->local); ret = -ENOMEM; goto put_dev; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { + nfc_llcp_local_put(llcp_sock->local); kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; ret = -EADDRINUSE; -- 2.27.0
[PATCH 3/4] nfc: fix memory leak in llcp_sock_connect()
In llcp_sock_connect(), use kmemdup to allocate memory for "llcp_sock->service_name". The memory is not released in the sock_unlink label of the subsequent failure branch. As a result, memory leakage occurs. fix CVE-2020-25672 Fixes: d646960f7986 ("NFC: Initial LLCP support") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.3 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9e2799ee1595..59172614b249 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -746,6 +746,8 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, sock_unlink: nfc_llcp_sock_unlink(>connecting_sockets, sk); + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); -- 2.27.0
[PATCH 0/4] nfc: fix Resource leakage and endless loop
fix Resource leakage and endless loop in net/nfc/llcp_sock.c, reported by "kiyin(尹亮)". Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Xiaoming Ni (4): nfc: fix refcount leak in llcp_sock_bind() nfc: fix refcount leak in llcp_sock_connect() nfc: fix memory leak in llcp_sock_connect() nfc: Avoid endless loops caused by repeated llcp_sock_connect() net/nfc/llcp_sock.c | 10 ++ 1 file changed, 10 insertions(+) -- 2.27.0
[PATCH 2/4] nfc: fix refcount leak in llcp_sock_connect()
nfc_llcp_local_get() is invoked in llcp_sock_connect(), but nfc_llcp_local_put() is not invoked in subsequent failure branches. As a result, refcount leakage occurs. To fix it, add calling nfc_llcp_local_put(). fix CVE-2020-25671 Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.6 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 68832ee4b9f8..9e2799ee1595 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -704,6 +704,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->local = nfc_llcp_local_get(local); llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { + nfc_llcp_local_put(llcp_sock->local); ret = -ENOMEM; goto put_dev; } @@ -748,6 +749,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); + nfc_llcp_local_put(llcp_sock->local); put_dev: nfc_put_device(dev); -- 2.27.0
[PATCH 4/4] nfc: Avoid endless loops caused by repeated llcp_sock_connect()
When sock_wait_state() returns -EINPROGRESS, "sk->sk_state" is LLCP_CONNECTING. In this case, llcp_sock_connect() is repeatedly invoked, nfc_llcp_sock_link() will add sk to local->connecting_sockets twice. sk->sk_node->next will point to itself, that will make an endless loop and hang-up the system. To fix it, check whether sk->sk_state is LLCP_CONNECTING in llcp_sock_connect() to avoid repeated invoking. fix CVE-2020-25673 Fixes: b4011239a08e ("NFC: llcp: Fix non blocking sockets connections") Reported-by: "kiyin(尹亮)" Link: https://www.openwall.com/lists/oss-security/2020/11/01/1 Cc: #v3.11 Signed-off-by: Xiaoming Ni --- net/nfc/llcp_sock.c | 4 1 file changed, 4 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 59172614b249..a3b46f03 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -673,6 +673,10 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, ret = -EISCONN; goto error; } + if (sk->sk_state == LLCP_CONNECTING) { + ret = -EINPROGRESS; + goto error; + } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { -- 2.27.0
Re: [PATCH] futex: fix dead code in attach_to_pi_owner()
On 2021/2/25 16:25, Greg KH wrote: On Mon, Feb 22, 2021 at 08:53:52PM +0800, Xiaoming Ni wrote: From: Thomas Gleixner The handle_exit_race() function is defined in commit c158b461306df82 ("futex: Cure exit race"), which never returns -EBUSY. This results in a small piece of dead code in the attach_to_pi_owner() function: int ret = handle_exit_race(uaddr, uval, p); /* Never return -EBUSY */ ... if (ret == -EBUSY) *exiting = p; /* dead code */ The return value -EBUSY is added to handle_exit_race() in upsteam commit ac31c7ff8624409 ("futex: Provide distinct return value when owner is exiting"). This commit was incorporated into v4.9.255, before the function handle_exit_race() was introduced, whitout Modify handle_exit_race(). To fix dead code, extract the change of handle_exit_race() from commit ac31c7ff8624409 ("futex: Provide distinct return value when owner is exiting"), re-incorporated. mainline: ac31c7ff8624 futex: Provide distinct return value when owner is exiting Fixes: c158b461306df82 ("futex: Cure exit race") stable linux-4.9.y 9c3f39860367 futex: Cure exit race c27f392040e2 futex: Provide distinct return value when owner is exiting Cc: sta...@vger.kernel.org # 4.9.258-rc1 Signed-off-by: Xiaoming Ni --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) What is the git commit id of this patch in Linus's tree? Also, what kernel tree(s) is this supposed to go to? thanks, greg k-h . Sorry, the commit id c158b461306df82 in the patch does not exist in the linux-stable repository. The commit ID is from linux-stable-rc. I corrected the commit id in a subsequent email, and added a branch label. https://lore.kernel.org/lkml/20210224100923.51315-1-nixiaom...@huawei.com/ Sorry, I forgot to use "--in-reply-to=" when I sent the update patch. This issue occurs only in the linux-4.9.y branch v4.9.258 Thanks xiaoming Ni
Re: [PATCH stable-rc queue/4.9 1/1] futex: Provide distinct return value when owner is exiting
On 2021/2/24 15:47, Greg KH wrote: On Wed, Feb 24, 2021 at 09:41:01AM +0800, Xiaoming Ni wrote: On 2021/2/23 21:00, Greg KH wrote: On Mon, Feb 22, 2021 at 10:11:37PM +0800, Xiaoming Ni wrote: On 2021/2/22 20:09, Greg KH wrote: On Mon, Feb 22, 2021 at 06:54:06PM +0800, Xiaoming Ni wrote: On 2021/2/22 18:16, Greg KH wrote: On Mon, Feb 22, 2021 at 03:03:28PM +0800, Xiaoming Ni wrote: From: Thomas Gleixner commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream. This commit is already in the 4.9 tree. If the backport was incorrect, say that here, and describe what went wrong and why this commit fixes it. Also state what commit this fixes as well, otherwise this changelog just looks like it is being applied again to the tree, which doesn't make much sense. thanks, greg k-h . I wrote a cover for it. but forgot to adjust the title of the cover: https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ I found a dead code in the queue/4.9 branch of the stable-rc repository. 2021-02-03: commit c27f392040e2f6 ("futex: Provide distinct return value when owner is exiting") The function handle_exit_race does not exist. Therefore, the change in handle_exit_race() is ignored in the patch round. 2021-02-22: commit e55cb811e612 ("futex: Cure exit race") Define the handle_exit_race() function, but no branch in the function returns EBUSY. As a result, dead code occurs in the attach_to_pi_owner(): int ret = handle_exit_race(uaddr, uval, p); ... if (ret == -EBUSY) *exiting = p; /* dead code */ To fix the dead code, modify the commit e55cb811e612 ("futex: Cure exit race"), or install a patch to incorporate the changes in handle_exit_race(). I am unfamiliar with the processing of the stable-rc queue branch, and I cannot find the patch mail of the current branch in https://lore.kernel.org/lkml/?q=%22futex%3A+Cure+exit+race%22 Therefore, I re-integrated commit ac31c7ff8624 ("futex: Provide distinct return value when owner is exiting"). And wrote a cover (but forgot to adjust the title of the cover): https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ So this is a "fixup" patch, right? Please clearly label it as such in your patch description and resend this as what is here I can not apply at all. thanks, greg k-h . Thank you for your guidance. I have updated the patch description and resent the patch based on v4.9.258-rc1 https://lore.kernel.org/lkml/20210222125352.110124-1-nixiaom...@huawei.com/ Can you please try 4.9.258 and let me know if this is still needed or not? thanks, greg k-h . The dead code problem still exists in V4.9.258. No conflict occurs during my patch integration. Do I need to correct the version number marked in the cc table in the patch and resend the patch? Please do. thanks, greg k-h . I have resend the patch based on v4.9.258. link: https://lore.kernel.org/lkml/20210224100923.51315-1-nixiaom...@huawei.com/ Thanks Xiaoming Ni
[PATCH 4.9.258] futex: fix dead code in attach_to_pi_owner()
The handle_exit_race() function is defined in commit 9c3f39860367 ("futex: Cure exit race"), which never returns -EBUSY. This results in a small piece of dead code in the attach_to_pi_owner() function: int ret = handle_exit_race(uaddr, uval, p); /* Never return -EBUSY */ ... if (ret == -EBUSY) *exiting = p; /* dead code */ The return value -EBUSY is added to handle_exit_race() in upsteam commit ac31c7ff8624409 ("futex: Provide distinct return value when owner is exiting"). This commit was incorporated into v4.9.255, before the function handle_exit_race() was introduced, whitout Modify handle_exit_race(). To fix dead code, extract the change of handle_exit_race() from commit ac31c7ff8624409 ("futex: Provide distinct return value when owner is exiting"), re-incorporated. Fixes: 9c3f39860367 ("futex: Cure exit race") Cc: sta...@vger.kernel.org # v4.9.258 Signed-off-by: Xiaoming Ni --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b65dbb5d60bb..0fd785410150 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1207,11 +1207,11 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, u32 uval2; /* -* If the futex exit state is not yet FUTEX_STATE_DEAD, wait -* for it to finish. +* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the +* caller that the alleged owner is busy. */ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) - return -EAGAIN; + return -EBUSY; /* * Reread the user space value to handle the following situation: -- 2.27.0
Re: [PATCH stable-rc queue/4.9 1/1] futex: Provide distinct return value when owner is exiting
On 2021/2/23 21:00, Greg KH wrote: On Mon, Feb 22, 2021 at 10:11:37PM +0800, Xiaoming Ni wrote: On 2021/2/22 20:09, Greg KH wrote: On Mon, Feb 22, 2021 at 06:54:06PM +0800, Xiaoming Ni wrote: On 2021/2/22 18:16, Greg KH wrote: On Mon, Feb 22, 2021 at 03:03:28PM +0800, Xiaoming Ni wrote: From: Thomas Gleixner commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream. This commit is already in the 4.9 tree. If the backport was incorrect, say that here, and describe what went wrong and why this commit fixes it. Also state what commit this fixes as well, otherwise this changelog just looks like it is being applied again to the tree, which doesn't make much sense. thanks, greg k-h . I wrote a cover for it. but forgot to adjust the title of the cover: https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ I found a dead code in the queue/4.9 branch of the stable-rc repository. 2021-02-03: commit c27f392040e2f6 ("futex: Provide distinct return value when owner is exiting") The function handle_exit_race does not exist. Therefore, the change in handle_exit_race() is ignored in the patch round. 2021-02-22: commit e55cb811e612 ("futex: Cure exit race") Define the handle_exit_race() function, but no branch in the function returns EBUSY. As a result, dead code occurs in the attach_to_pi_owner(): int ret = handle_exit_race(uaddr, uval, p); ... if (ret == -EBUSY) *exiting = p; /* dead code */ To fix the dead code, modify the commit e55cb811e612 ("futex: Cure exit race"), or install a patch to incorporate the changes in handle_exit_race(). I am unfamiliar with the processing of the stable-rc queue branch, and I cannot find the patch mail of the current branch in https://lore.kernel.org/lkml/?q=%22futex%3A+Cure+exit+race%22 Therefore, I re-integrated commit ac31c7ff8624 ("futex: Provide distinct return value when owner is exiting"). And wrote a cover (but forgot to adjust the title of the cover): https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ So this is a "fixup" patch, right? Please clearly label it as such in your patch description and resend this as what is here I can not apply at all. thanks, greg k-h . Thank you for your guidance. I have updated the patch description and resent the patch based on v4.9.258-rc1 https://lore.kernel.org/lkml/20210222125352.110124-1-nixiaom...@huawei.com/ Can you please try 4.9.258 and let me know if this is still needed or not? thanks, greg k-h . The dead code problem still exists in V4.9.258. No conflict occurs during my patch integration. Do I need to correct the version number marked in the cc table in the patch and resend the patch? Thanks Xiaoming Ni
Re: [PATCH stable-rc queue/4.9 1/1] futex: Provide distinct return value when owner is exiting
On 2021/2/22 20:09, Greg KH wrote: On Mon, Feb 22, 2021 at 06:54:06PM +0800, Xiaoming Ni wrote: On 2021/2/22 18:16, Greg KH wrote: On Mon, Feb 22, 2021 at 03:03:28PM +0800, Xiaoming Ni wrote: From: Thomas Gleixner commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream. This commit is already in the 4.9 tree. If the backport was incorrect, say that here, and describe what went wrong and why this commit fixes it. Also state what commit this fixes as well, otherwise this changelog just looks like it is being applied again to the tree, which doesn't make much sense. thanks, greg k-h . I wrote a cover for it. but forgot to adjust the title of the cover: https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ I found a dead code in the queue/4.9 branch of the stable-rc repository. 2021-02-03: commit c27f392040e2f6 ("futex: Provide distinct return value when owner is exiting") The function handle_exit_race does not exist. Therefore, the change in handle_exit_race() is ignored in the patch round. 2021-02-22: commit e55cb811e612 ("futex: Cure exit race") Define the handle_exit_race() function, but no branch in the function returns EBUSY. As a result, dead code occurs in the attach_to_pi_owner(): int ret = handle_exit_race(uaddr, uval, p); ... if (ret == -EBUSY) *exiting = p; /* dead code */ To fix the dead code, modify the commit e55cb811e612 ("futex: Cure exit race"), or install a patch to incorporate the changes in handle_exit_race(). I am unfamiliar with the processing of the stable-rc queue branch, and I cannot find the patch mail of the current branch in https://lore.kernel.org/lkml/?q=%22futex%3A+Cure+exit+race%22 Therefore, I re-integrated commit ac31c7ff8624 ("futex: Provide distinct return value when owner is exiting"). And wrote a cover (but forgot to adjust the title of the cover): https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ So this is a "fixup" patch, right? Please clearly label it as such in your patch description and resend this as what is here I can not apply at all. thanks, greg k-h . Thank you for your guidance. I have updated the patch description and resent the patch based on v4.9.258-rc1 https://lore.kernel.org/lkml/20210222125352.110124-1-nixiaom...@huawei.com/ Thanks Xiaoming Ni
[PATCH] futex: fix dead code in attach_to_pi_owner()
From: Thomas Gleixner The handle_exit_race() function is defined in commit c158b461306df82 ("futex: Cure exit race"), which never returns -EBUSY. This results in a small piece of dead code in the attach_to_pi_owner() function: int ret = handle_exit_race(uaddr, uval, p); /* Never return -EBUSY */ ... if (ret == -EBUSY) *exiting = p; /* dead code */ The return value -EBUSY is added to handle_exit_race() in upsteam commit ac31c7ff8624409 ("futex: Provide distinct return value when owner is exiting"). This commit was incorporated into v4.9.255, before the function handle_exit_race() was introduced, whitout Modify handle_exit_race(). To fix dead code, extract the change of handle_exit_race() from commit ac31c7ff8624409 ("futex: Provide distinct return value when owner is exiting"), re-incorporated. Fixes: c158b461306df82 ("futex: Cure exit race") Cc: sta...@vger.kernel.org # 4.9.258-rc1 Signed-off-by: Xiaoming Ni --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b65dbb5d60bb..0fd785410150 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1207,11 +1207,11 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, u32 uval2; /* -* If the futex exit state is not yet FUTEX_STATE_DEAD, wait -* for it to finish. +* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the +* caller that the alleged owner is busy. */ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) - return -EAGAIN; + return -EBUSY; /* * Reread the user space value to handle the following situation: -- 2.27.0
Re: [PATCH stable-rc queue/4.9 1/1] futex: Provide distinct return value when owner is exiting
On 2021/2/22 18:16, Greg KH wrote: On Mon, Feb 22, 2021 at 03:03:28PM +0800, Xiaoming Ni wrote: From: Thomas Gleixner commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream. This commit is already in the 4.9 tree. If the backport was incorrect, say that here, and describe what went wrong and why this commit fixes it. Also state what commit this fixes as well, otherwise this changelog just looks like it is being applied again to the tree, which doesn't make much sense. thanks, greg k-h . I wrote a cover for it. but forgot to adjust the title of the cover: https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ I found a dead code in the queue/4.9 branch of the stable-rc repository. 2021-02-03: commit c27f392040e2f6 ("futex: Provide distinct return value when owner is exiting") The function handle_exit_race does not exist. Therefore, the change in handle_exit_race() is ignored in the patch round. 2021-02-22: commit e55cb811e612 ("futex: Cure exit race") Define the handle_exit_race() function, but no branch in the function returns EBUSY. As a result, dead code occurs in the attach_to_pi_owner(): int ret = handle_exit_race(uaddr, uval, p); ... if (ret == -EBUSY) *exiting = p; /* dead code */ To fix the dead code, modify the commit e55cb811e612 ("futex: Cure exit race"), or install a patch to incorporate the changes in handle_exit_race(). I am unfamiliar with the processing of the stable-rc queue branch, and I cannot find the patch mail of the current branch in https://lore.kernel.org/lkml/?q=%22futex%3A+Cure+exit+race%22 Therefore, I re-integrated commit ac31c7ff8624 ("futex: Provide distinct return value when owner is exiting"). And wrote a cover (but forgot to adjust the title of the cover): https://lore.kernel.org/lkml/20210222070328.102384-1-nixiaom...@huawei.com/ Thanks Xiaoming Ni
[PATCH stable-rc queue/4.9 0/1] repatch
I found a dead code in the queue/4.9 branch of the stable-rc repository. 2021-02-03: commit c27f392040e2f6 ("futex: Provide distinct return value when owner is exiting") The function handle_exit_race does not exist. Therefore, the change in handle_exit_race() is ignored in the patch round. 2021-02-22: commit e55cb811e612 ("futex: Cure exit race") Define the handle_exit_race() function, but no branch in the function returns EBUSY. As a result, dead code occurs in the attach_to_pi_owner(): int ret = handle_exit_race(uaddr, uval, p); ... if (ret == -EBUSY) *exiting = p; /* dead code */ To fix the dead code, modify the commit e55cb811e612 ("futex: Cure exit race"), or install a patch to incorporate the changes in handle_exit_race(). I am unfamiliar with the processing of the stable-rc queue branch, and I cannot find the patch mail of the current branch in https://lore.kernel.org/lkml/?q=%22futex%3A+Cure+exit+race%22 Therefore, I re-integrated commit ac31c7ff8624 ("futex: Provide distinct return value when owner is exiting"). - Thomas Gleixner (1): futex: Provide distinct return value when owner is exiting kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- 2.27.0
[PATCH stable-rc queue/4.9 1/1] futex: Provide distinct return value when owner is exiting
From: Thomas Gleixner commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream. attach_to_pi_owner() returns -EAGAIN for various cases: - Owner task is exiting - Futex value has changed The caller drops the held locks (hash bucket, mmap_sem) and retries the operation. In case of the owner task exiting this can result in a live lock. As a preparatory step for seperating those cases, provide a distinct return value (EBUSY) for the owner exiting case. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.935606...@linutronix.de [nixiaoming: Modify handle_exit_race() to avoid dead code.] Cc: sta...@vger.kernel.org # queue/4.9 Signed-off-by: Xiaoming Ni --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b65dbb5d60bb..0fd785410150 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1207,11 +1207,11 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, u32 uval2; /* -* If the futex exit state is not yet FUTEX_STATE_DEAD, wait -* for it to finish. +* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the +* caller that the alleged owner is busy. */ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) - return -EAGAIN; + return -EBUSY; /* * Reread the user space value to handle the following situation: -- 2.27.0
[PATCH v4] proc_sysctl: fix oops caused by incorrect command parameters.
The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example: "hung_task_panic=1" is incorrectly written as "hung_task_panic", oops is triggered. The call stack is as follows: Kernel command line: hung_task_panic .. Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 To fix it, check whether "val" is empty when "phram" is a sysctl field. Error codes are returned in the failure branch, and error logs are generated by parse_args(). Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Cc: sta...@kernel.org # v5.8-rc1+ Signed-off-by: Xiaoming Ni - v4: According to Vlastimil Babka's recommendations add check len == 0, and cc stable v3: https://lore.kernel.org/lkml/20210112033155.91502-1-nixiaom...@huawei.com/ Return -EINVAL, When phram is the sysctl field and val is empty. v2: https://lore.kernel.org/lkml/20210108023339.55917-1-nixiaom...@huawei.com/ Added log output of the failure branch based on the review comments of Kees Cook. v1: https://lore.kernel.org/lkml/20201224074256.117413-1-nixiaom...@huawei.com/ - --- fs/proc/proc_sysctl.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..d2018f70d1fa 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1770,6 +1770,12 @@ static int process_sysctl_arg(char *param, char *val, return 0; } + if (!val) + return -EINVAL; + len = strlen(val); + if (len == 0) + return -EINVAL; + /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no @@ -1811,7 +1817,6 @@ static int process_sysctl_arg(char *param, char *val, file, param, val); goto out; } - len = strlen(val); wret = kernel_write(file, val, len, ); if (wret < 0) { err = wret; -- 2.27.0
Re: [PATCH v3] proc_sysctl: fix oops caused by incorrect command parameters.
On 2021/1/12 19:42, Vlastimil Babka wrote: On 1/12/21 8:24 AM, Michal Hocko wrote: If we're going to do a separate "patch: make process_sysctl_arg() return an errno instead of 0" then fine, we can discuss that. But it's conceptually a different work from fixing this situation. . However, are the logs generated by process_sysctl_arg() clearer and more accurate than parse_args()? Should the logs generated by process_sysctl_arg() be deleted? I think the individual logs are very useful and should be retained. Yes, other sysfs specific error messages are likely useful. I just fail to see why a missing value should be handled here when there is an existing handling in the caller. Not sure whether a complete shadow reporting in process_sysctl_arg is a deliberate decision or not. Vlastimil? Yes, it's a way to have more useful sysctl-specific reports than the generic ones. And I think I was inspired by some other existing code, but don't remember exactly. The options are: 1) the current sysctl-specific reports, return 0 as the values are only consumed 2) be silent and return error, invent new error codes to have generic report be more useful for sysctl, but inevitably lose some nuances anyway 3) a mix where 2) is used for situations where generic report is sufficient enough, 1) where not Patch v2 went with option 1), v3 with option 3). I think it's down to preferences. I would personally go with v2 and message similar to the existing ones, i.e.: "Failed to set sysctl parameter '%s': no value given\n" Also we seem to be silently doing nothing when strlen(val) == 0, i.e. "hung_task_panic=" was passed. Worth reporting the same error. But v3 is fine with me as well. The generic error message works. We could just add "if (!len) return -EINVAL" below the strlen() call. Also please Cc: stable. Anyway one way or the other, all I care about is to have a reporting in place because this shouldn't be a silent failure. The current v2 is already in the linux-next branch and throws a new error: https://lore.kernel.org/lkml/cb54e349-7147-0a1f-a349-1e16ba603...@infradead.org/ This bug has been mentioned in the previous discussion and has been fixed in the current v3 patch. https://lore.kernel.org/linux-fsdevel/20210149.20A58E1@keescook/ What am I supposed to do now? - Resend V3? - Rewrite a new fix patch based on the current code of linux-next. - Develop a new V4 patch: Use V2 to discuss how to use the Patch4 solution. https://lore.kernel.org/linux-fsdevel/bc098af4-c0cd-212e-d09d-46d617d0a...@huawei.com/#t Thanks Xiaoming Ni
Re: [PATCH v3] proc_sysctl: fix oops caused by incorrect command parameters.
On 2021/1/12 12:33, Andrew Morton wrote: On Tue, 12 Jan 2021 11:31:55 +0800 Xiaoming Ni wrote: The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1770,6 +1770,9 @@ static int process_sysctl_arg(char *param, char *val, return 0; } + if (!val) + return -EINVAL; + I think v2 (return 0) was preferable. Because all the other error-out cases in process_sysctl_arg() also do a `return 0'. https://lore.kernel.org/lkml/bc098af4-c0cd-212e-d09d-46d617d0a...@huawei.com/ patch4: +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,9 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) + return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; Is this the version you're talking about? If we're going to do a separate "patch: make process_sysctl_arg() return an errno instead of 0" then fine, we can discuss that. But it's conceptually a different work from fixing this situation. . However, are the logs generated by process_sysctl_arg() clearer and more accurate than parse_args()? Should the logs generated by process_sysctl_arg() be deleted? Thanks Xiaoming Ni
[PATCH v3] proc_sysctl: fix oops caused by incorrect command parameters.
The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example: "hung_task_panic=1" is incorrectly written as "hung_task_panic", oops is triggered. The call stack is as follows: Kernel command line: hung_task_panic .. Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 To fix it, check whether "val" is empty when "phram" is a sysctl field. Error codes are returned in the failure branch, and error logs are generated by parse_args(). Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni - v3: Return -EINVAL, When phram is the sysctl field and val is empty. v2: https://lore.kernel.org/lkml/20210108023339.55917-1-nixiaom...@huawei.com/ Added log output of the failure branch based on the review comments of Kees Cook. v1: https://lore.kernel.org/lkml/20201224074256.117413-1-nixiaom...@huawei.com/ - --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..d493a50058a5 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1770,6 +1770,9 @@ static int process_sysctl_arg(char *param, char *val, return 0; } + if (!val) + return -EINVAL; + /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no -- 2.27.0
Re: [PATCH v2] proc_sysctl: fix oops caused by incorrect command parameters.
On 2021/1/9 9:50, Andrew Morton wrote: On Fri, 8 Jan 2021 21:10:25 +0100 Michal Hocko wrote: Why would that matter? A missing value is clearly a error path and it should be reported. This test is in the correct place. I think it's just a question of the return values. I was probably not clear. The test for val is at the right place. I would just expect -EINVAL and have the generic code to report. It does seem a bit screwy that process_sysctl_arg() returns zero in all situations (parse_args() is set up to handle an error return from it). But this patch is consistent with all the other error handling in process_sysctl_arg(). . Set the kernel startup parameter to "nosmp nokaslr hung_task_panic" and test the startup logs of different patches. patch1: +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,11 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) { + pr_err("Missing param value! Expected '%s=...value...'\n", param); + return 0; + } + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; sysctl log for patch1: Missing param value! Expected 'nosmp=...value...' Missing param value! Expected 'nokaslr=...value...' Missing param value! Expected 'hung_task_panic=...value...' patch2: +++ b/fs/proc/proc_sysctl.c @@ -1756,6 +1756,8 @@ static int process_sysctl_arg(char *param, char *val, int err; loff_t pos = 0; ssize_t wret; + if (!val) + return -EINVAL; if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; sysctl log for patch2: Setting sysctl args: `' invalid for parameter `nosmp' Setting sysctl args: `' invalid for parameter `nokaslr' Setting sysctl args: `' invalid for parameter `hung_task_panic' patch3: +++ b/fs/proc/proc_sysctl.c @@ -1770,6 +1770,9 @@ static int process_sysctl_arg(char *param, char *val, return 0; } + if (!val) + return -EINVAL; + /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no sysctl log for patch3: Setting sysctl args: `' invalid for parameter `hung_task_panic' patch4: +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,9 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) + return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; sysctl log for patch3: no log When process_sysctl_arg() is called, the param parameter may not be the sysctl parameter. Patch3 or patch4, which is better? Thanks Xiaoming Ni
Re: [PATCH v2] proc_sysctl: fix oops caused by incorrect command parameters.
On 2021/1/9 17:10, Andy Shevchenko wrote: On Friday, January 8, 2021, Xiaoming Ni <mailto:nixiaom...@huawei.com>> wrote: The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example, "hung_task_panic=1" is incorrectly written as "hung_task_panic". log: Can you drop redundant (not significant) lines from the log to avoid noisy commit message? ok, Thank you for your advice. I will update it in v3 patch. Thanks Xiaoming Ni.
Re: [PATCH v2] proc_sysctl: fix oops caused by incorrect command parameters.
On 2021/1/8 17:21, Michal Hocko wrote: On Fri 08-01-21 10:33:39, Xiaoming Ni wrote: The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example, "hung_task_panic=1" is incorrectly written as "hung_task_panic". log: Kernel command line: hung_task_panic [000n] user address but active_mm is swapper Internal error: Oops: 9605 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.1 #1 Hardware name: linux,dummy-virt (DT) pstate: 4005 (nZcv daif -PAN -UAO -TCO BTYPE=--) pc : __pi_strlen+0x10/0x98 lr : process_sysctl_arg+0x1e4/0x2ac sp : ffc01104bd40 x29: ffc01104bd40 x28: x27: ff80c0a4691e x26: ffc0102a7c8c x25: x24: ffc01104be80 x23: ff80c22f0b00 x22: ff80c02e28c0 x21: ffc0109f9000 x20: x19: ffc0107c08de x18: 0003 x17: ffc01105d000 x16: 0054 x15: x14: 3030253078413830 x13: x12: x11: 0101010101010101 x10: 0005 x9 : 0003 x8 : ff80c0980c08 x7 : x6 : 0002 x5 : ff80c0235000 x4 : ff810f7c7ee0 x3 : 043a x2 : 00bdcc4ebacf1a54 x1 : x0 : Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 Code: b200c3eb 927cec01 f2400c07 54000301 (a8c10c22) Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni Thanks for catching this! - v2: Added log output of the failure branch based on the review comments of Kees Cook. v1: https://lore.kernel.org/lkml/20201224074256.117413-1-nixiaom...@huawei.com/ - --- fs/proc/proc_sysctl.c | 5 + 1 file changed, 5 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..dc1a56515e86 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,11 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) { + pr_err("Missing param value! Expected '%s=...value...'\n", param); + return 0; I may need to move the validation code for val to the end of the validation code for param to prevent non-sysctl arguments from triggering the current print. Or delete the print and keep it silent for a little better performance. Which is better? + } Shouldn't you return an error here? Also my understanding is that parse_args is responsible for reporting the error. All exception branches in process_sysctl_arg record logs and return 0. Do I need to keep the same processing in the new branch? + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; -- 2.27.0 Thanks Xiaoming Ni
[PATCH v2] proc_sysctl: fix oops caused by incorrect command parameters.
The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example, "hung_task_panic=1" is incorrectly written as "hung_task_panic". log: Kernel command line: hung_task_panic [000n] user address but active_mm is swapper Internal error: Oops: 9605 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.1 #1 Hardware name: linux,dummy-virt (DT) pstate: 4005 (nZcv daif -PAN -UAO -TCO BTYPE=--) pc : __pi_strlen+0x10/0x98 lr : process_sysctl_arg+0x1e4/0x2ac sp : ffc01104bd40 x29: ffc01104bd40 x28: x27: ff80c0a4691e x26: ffc0102a7c8c x25: x24: ffc01104be80 x23: ff80c22f0b00 x22: ff80c02e28c0 x21: ffc0109f9000 x20: x19: ffc0107c08de x18: 0003 x17: ffc01105d000 x16: 0054 x15: x14: 3030253078413830 x13: x12: x11: 0101010101010101 x10: 0005 x9 : 0003 x8 : ff80c0980c08 x7 : x6 : 0002 x5 : ff80c0235000 x4 : ff810f7c7ee0 x3 : 043a x2 : 00bdcc4ebacf1a54 x1 : x0 : Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 Code: b200c3eb 927cec01 f2400c07 54000301 (a8c10c22) Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni - v2: Added log output of the failure branch based on the review comments of Kees Cook. v1: https://lore.kernel.org/lkml/20201224074256.117413-1-nixiaom...@huawei.com/ - --- fs/proc/proc_sysctl.c | 5 + 1 file changed, 5 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..dc1a56515e86 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,11 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) { + pr_err("Missing param value! Expected '%s=...value...'\n", param); + return 0; + } + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; -- 2.27.0
Re: [PATCH] proc_sysclt: fix oops caused by incorrect command parameters.
On 2021/1/7 7:46, Kees Cook wrote: subject typo: "sysclt" -> "sysctl" On Thu, Dec 24, 2020 at 03:42:56PM +0800, Xiaoming Ni wrote: The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example, "hung_task_panic=1" is incorrectly written as "hung_task_panic". log: Kernel command line: hung_task_panic [000n] user address but active_mm is swapper Internal error: Oops: 9605 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.1 #1 Hardware name: linux,dummy-virt (DT) pstate: 4005 (nZcv daif -PAN -UAO -TCO BTYPE=--) pc : __pi_strlen+0x10/0x98 lr : process_sysctl_arg+0x1e4/0x2ac sp : ffc01104bd40 x29: ffc01104bd40 x28: x27: ff80c0a4691e x26: ffc0102a7c8c x25: x24: ffc01104be80 x23: ff80c22f0b00 x22: ff80c02e28c0 x21: ffc0109f9000 x20: x19: ffc0107c08de x18: 0003 x17: ffc01105d000 x16: 0054 x15: x14: 3030253078413830 x13: x12: x11: 0101010101010101 x10: 0005 x9 : 0003 x8 : ff80c0980c08 x7 : x6 : 0002 x5 : ff80c0235000 x4 : ff810f7c7ee0 x3 : 043a x2 : 00bdcc4ebacf1a54 x1 : x0 : Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 Code: b200c3eb 927cec01 f2400c07 54000301 (a8c10c22) Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..4516411a2b44 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,9 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) + return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; Otherwise, yeah, this is a good test to add. I would make it more verbose, though: if (!val) { pr_err("Missing param value! Expected '%s=...value...'\n", param); return 0; } Yes, it's better to add log output. Thank you for your review. Do I need to send V2 patch based on review comments? Thanks Xiaoming Ni
ping //Re: [PATCH] proc_sysclt: fix oops caused by incorrect command parameters.
ping On 2020/12/24 15:42, Xiaoming Ni wrote: The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example, "hung_task_panic=1" is incorrectly written as "hung_task_panic". log: Kernel command line: hung_task_panic [000n] user address but active_mm is swapper Internal error: Oops: 9605 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.1 #1 Hardware name: linux,dummy-virt (DT) pstate: 4005 (nZcv daif -PAN -UAO -TCO BTYPE=--) pc : __pi_strlen+0x10/0x98 lr : process_sysctl_arg+0x1e4/0x2ac sp : ffc01104bd40 x29: ffc01104bd40 x28: x27: ff80c0a4691e x26: ffc0102a7c8c x25: x24: ffc01104be80 x23: ff80c22f0b00 x22: ff80c02e28c0 x21: ffc0109f9000 x20: x19: ffc0107c08de x18: 0003 x17: ffc01105d000 x16: 0054 x15: x14: 3030253078413830 x13: x12: x11: 0101010101010101 x10: 0005 x9 : 0003 x8 : ff80c0980c08 x7 : x6 : 0002 x5 : ff80c0235000 x4 : ff810f7c7ee0 x3 : 043a x2 : 00bdcc4ebacf1a54 x1 : x0 : Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 Code: b200c3eb 927cec01 f2400c07 54000301 (a8c10c22) Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..4516411a2b44 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,9 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) + return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1;
[PATCH] proc_sysclt: fix oops caused by incorrect command parameters.
The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example, "hung_task_panic=1" is incorrectly written as "hung_task_panic". log: Kernel command line: hung_task_panic [000n] user address but active_mm is swapper Internal error: Oops: 9605 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.1 #1 Hardware name: linux,dummy-virt (DT) pstate: 4005 (nZcv daif -PAN -UAO -TCO BTYPE=--) pc : __pi_strlen+0x10/0x98 lr : process_sysctl_arg+0x1e4/0x2ac sp : ffc01104bd40 x29: ffc01104bd40 x28: x27: ff80c0a4691e x26: ffc0102a7c8c x25: x24: ffc01104be80 x23: ff80c22f0b00 x22: ff80c02e28c0 x21: ffc0109f9000 x20: x19: ffc0107c08de x18: 0003 x17: ffc01105d000 x16: 0054 x15: x14: 3030253078413830 x13: x12: x11: 0101010101010101 x10: 0005 x9 : 0003 x8 : ff80c0980c08 x7 : x6 : 0002 x5 : ff80c0235000 x4 : ff810f7c7ee0 x3 : 043a x2 : 00bdcc4ebacf1a54 x1 : x0 : Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 Code: b200c3eb 927cec01 f2400c07 54000301 (a8c10c22) Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..4516411a2b44 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1757,6 +1757,9 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; + if (!val) + return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; -- 2.27.0
Re: [PATCH] powerpc:Don't print raw EIP/LR hex values in dump_stack() and show_regs()
On 2020/12/22 1:12, Segher Boessenkool wrote: On Mon, Dec 21, 2020 at 04:42:23PM +, David Laight wrote: From: Segher Boessenkool Sent: 21 December 2020 16:32 On Mon, Dec 21, 2020 at 04:17:21PM +0100, Christophe Leroy wrote: Le 21/12/2020 à 04:27, Xiaoming Ni a écrit : Since the commit 2b0e86cc5de6 ("powerpc/fsl_booke/32: implement KASLR infrastructure"), the powerpc system is ready to support KASLR. To reduces the risk of invalidating address randomization, don't print the EIP/LR hex values in dump_stack() and show_regs(). I think your change is not enough to hide EIP address, see below a dump with you patch, you get "Faulting instruction address: 0xc03a0c14" As far as I can see the patch does nothing to the GPR printout. Often GPRs contain code addresses. As one example, the LR is moved via a GPR (often GPR0, but not always) for storing on the stack. So this needs more work. If the dump_stack() is from an oops you need the real EIP value on order to stand any chance of making headway. Or at least the function name + offset, yes. When the system is healthy, only symbols and offsets are printed, Output address and symbol + offset when the system is dying Does this meet both debugging and security requirements? For example: +static void __show_regs_ip_lr(const char *flag, unsigned long addr) +{ + if (system_going_down()) { /* panic oops reboot */ + pr_cont("%s["REG"] %pS", flag, addr, (void *)addr); + } else { + pr_cont("%s%pS", flag, (void *)addr); + } +} + static void __show_regs(struct pt_regs *regs) { int i, trap; - printk("NIP: "REG" LR: "REG" CTR: "REG"\n", - regs->nip, regs->link, regs->ctr); + __show_regs_ip_lr("NIP: ", regs->nip); + __show_regs_ip_lr(" LR: ", regs->link); + pr_cont(" CTR: "REG"\n", regs->ctr); printk("REGS: %px TRAP: %04lx %s (%s)\n", regs, regs->trap, print_tainted(), init_utsname()->release); printk("MSR: "REG" ", regs->msr); Otherwise you might just as well just print 'borked - tough luck'. Yes. ASLR is a house of cards. But that isn't constructive wrt this patch :-) Segher . Thanks Xiaoming Ni
[PATCH] powerpc:Don't print raw EIP/LR hex values in dump_stack() and show_regs()
Since the commit 2b0e86cc5de6 ("powerpc/fsl_booke/32: implement KASLR infrastructure"), the powerpc system is ready to support KASLR. To reduces the risk of invalidating address randomization, don't print the EIP/LR hex values in dump_stack() and show_regs(). This patch follows x86 and arm64's lead: commit a25ffd3a6302a6 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") commit bb5e5ce545f203 ("x86/dumpstack: Remove kernel text addresses from stack dump") Signed-off-by: Xiaoming Ni --- arch/powerpc/kernel/process.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a66f435dabbf..913cf1ea702e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1455,8 +1455,8 @@ static void __show_regs(struct pt_regs *regs) { int i, trap; - printk("NIP: "REG" LR: "REG" CTR: "REG"\n", - regs->nip, regs->link, regs->ctr); + printk("NIP: %pS LR: %pS CTR: "REG"\n", + (void *)regs->nip, (void *)regs->link, regs->ctr); printk("REGS: %px TRAP: %04lx %s (%s)\n", regs, regs->trap, print_tainted(), init_utsname()->release); printk("MSR: "REG" ", regs->msr); @@ -1493,8 +1493,8 @@ static void __show_regs(struct pt_regs *regs) * above info out without failing */ if (IS_ENABLED(CONFIG_KALLSYMS)) { - printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); - printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); + printk("NIP %pS\n", (void *)regs->nip); + printk("LR %pS\n", (void *)regs->link); } } @@ -2160,8 +2160,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack, newsp = stack[0]; ip = stack[STACK_FRAME_LR_SAVE]; if (!firstframe || ip != lr) { - printk("%s["REG"] ["REG"] %pS", - loglvl, sp, ip, (void *)ip); + printk("%s ["REG"] %pS", + loglvl, sp, (void *)ip); ret_addr = ftrace_graph_ret_addr(current, _idx, ip, stack); if (ret_addr != ip) -- 2.27.0
[PATCH v2 0/4] panic: Add new API in_panic_state()
For some features (such as hang_task, ledtrig-activity, ledtrig-heartbeat) different processing logics need to be performed based on whether the current system is in panic state: 1: Register hook for panic_notifier_list. 2. Assign a value to the global variable in the hook function. 3. Determine whether the system is in panic state based on the global variable and perform different processing. Duplicate code snippets exist, and the timing judgment is relatively lag. Therefore, consider extracting the new API: bool in_panic_state(void). v2: Rename api to in_panic_state as recommended by Pavel Machek, Tetsuo Handa, Randy Dunlap. v1: https://lore.kernel.org/lkml/20201218114406.61906-1-nixiaom...@huawei.com/ API name: is_being_panic Xiaoming Ni (4): panic: Add new API in_panic_state() hung_task: Replace "did_panic" with in_panic_state() leds:trigger:ledtrig-activity Replace "panic_detected" with in_panic_state() leds:trigger:ledtrig-heartbeat: Replace "panic_heartbeats" with in_panic_state() drivers/leds/trigger/ledtrig-activity.c | 19 +-- drivers/leds/trigger/ledtrig-heartbeat.c | 19 +-- include/linux/kernel.h | 1 + kernel/hung_task.c | 17 + kernel/panic.c | 6 ++ 5 files changed, 10 insertions(+), 52 deletions(-) -- 2.27.0
[PATCH v2 4/4] leds:trigger:ledtrig-heartbeat: Replace "panic_heartbeats" with in_panic_state()
Replace the global variable "panic_heartbeats" with in_panic_state() Signed-off-by: Xiaoming Ni --- drivers/leds/trigger/ledtrig-heartbeat.c | 19 +-- 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c index 36b6709afe9f..f24d64cf0a62 100644 --- a/drivers/leds/trigger/ledtrig-heartbeat.c +++ b/drivers/leds/trigger/ledtrig-heartbeat.c @@ -19,8 +19,6 @@ #include #include "../leds.h" -static int panic_heartbeats; - struct heartbeat_trig_data { struct led_classdev *led_cdev; unsigned int phase; @@ -39,7 +37,7 @@ static void led_heartbeat_function(struct timer_list *t) led_cdev = heartbeat_data->led_cdev; - if (unlikely(panic_heartbeats)) { + if (unlikely(in_panic_state())) { led_set_brightness_nosleep(led_cdev, LED_OFF); return; } @@ -169,28 +167,15 @@ static int heartbeat_reboot_notifier(struct notifier_block *nb, return NOTIFY_DONE; } -static int heartbeat_panic_notifier(struct notifier_block *nb, -unsigned long code, void *unused) -{ - panic_heartbeats = 1; - return NOTIFY_DONE; -} - static struct notifier_block heartbeat_reboot_nb = { .notifier_call = heartbeat_reboot_notifier, }; -static struct notifier_block heartbeat_panic_nb = { - .notifier_call = heartbeat_panic_notifier, -}; - static int __init heartbeat_trig_init(void) { int rc = led_trigger_register(_led_trigger); if (!rc) { - atomic_notifier_chain_register(_notifier_list, - _panic_nb); register_reboot_notifier(_reboot_nb); } return rc; @@ -199,8 +184,6 @@ static int __init heartbeat_trig_init(void) static void __exit heartbeat_trig_exit(void) { unregister_reboot_notifier(_reboot_nb); - atomic_notifier_chain_unregister(_notifier_list, -_panic_nb); led_trigger_unregister(_led_trigger); } -- 2.27.0
[PATCH v2 3/4] leds:trigger:ledtrig-activity Replace "panic_detected" with in_panic_state()
Replace the global variable "panic_detected" with in_panic_state() Signed-off-by: Xiaoming Ni --- drivers/leds/trigger/ledtrig-activity.c | 19 +-- 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c index 14ba7faaed9e..9675c9348516 100644 --- a/drivers/leds/trigger/ledtrig-activity.c +++ b/drivers/leds/trigger/ledtrig-activity.c @@ -17,8 +17,6 @@ #include #include "../leds.h" -static int panic_detected; - struct activity_data { struct timer_list timer; struct led_classdev *led_cdev; @@ -47,7 +45,7 @@ static void led_activity_function(struct timer_list *t) if (test_and_clear_bit(LED_BLINK_BRIGHTNESS_CHANGE, _cdev->work_flags)) led_cdev->blink_brightness = led_cdev->new_blink_brightness; - if (unlikely(panic_detected)) { + if (unlikely(in_panic_state())) { /* full brightness in case of panic */ led_set_brightness_nosleep(led_cdev, led_cdev->blink_brightness); return; @@ -226,28 +224,15 @@ static int activity_reboot_notifier(struct notifier_block *nb, return NOTIFY_DONE; } -static int activity_panic_notifier(struct notifier_block *nb, - unsigned long code, void *unused) -{ - panic_detected = 1; - return NOTIFY_DONE; -} - static struct notifier_block activity_reboot_nb = { .notifier_call = activity_reboot_notifier, }; -static struct notifier_block activity_panic_nb = { - .notifier_call = activity_panic_notifier, -}; - static int __init activity_init(void) { int rc = led_trigger_register(_led_trigger); if (!rc) { - atomic_notifier_chain_register(_notifier_list, - _panic_nb); register_reboot_notifier(_reboot_nb); } return rc; @@ -256,8 +241,6 @@ static int __init activity_init(void) static void __exit activity_exit(void) { unregister_reboot_notifier(_reboot_nb); - atomic_notifier_chain_unregister(_notifier_list, -_panic_nb); led_trigger_unregister(_led_trigger); } -- 2.27.0
[PATCH v2 2/4] hung_task: Replace "did_panic" with in_panic_state()
Replace the global variable "did_panic" with in_panic_state() Signed-off-by: Xiaoming Ni --- kernel/hung_task.c | 17 + 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bb2e3e15c84c..2747cd6dd35e 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -50,7 +50,6 @@ unsigned long __read_mostly sysctl_hung_task_check_interval_secs; int __read_mostly sysctl_hung_task_warnings = 10; -static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; static bool hung_task_show_all_bt; @@ -72,18 +71,6 @@ unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int -hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = hung_task_panic, -}; - static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -223,7 +210,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if (test_taint(TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || unlikely(in_panic_state())) return; hung_task_show_lock = false; @@ -347,8 +334,6 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { - atomic_notifier_chain_register(_notifier_list, _block); - /* Disable hung task detector on suspend */ pm_notifier(hungtask_pm_notify, 0); -- 2.27.0
[PATCH v2 1/4] panic: Add new API in_panic_state()
For some features (such as hang_task, ledtrig-activity, ledtrig-heartbeat) different processing logics need to be performed based on whether the current system is in panic state: 1: Register hook for panic_notifier_list. 2. Assign a value to the global variable in the hook function. 3. Determine whether the system is in panic state based on the global variable and perform different processing. Duplicate code snippets exist, and the timing judgment is relatively lag. Therefore, consider extracting the new API: bool in_panic_state(void). Signed-off-by: Xiaoming Ni --- include/linux/kernel.h | 1 + kernel/panic.c | 6 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7902d8c1048..c9a9078157b6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -167,6 +167,7 @@ void __might_fault(const char *file, int line); static inline void might_fault(void) { } #endif +extern bool in_panic_state(void); extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) diff --git a/kernel/panic.c b/kernel/panic.c index 332736a72a58..351627883a04 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -125,6 +125,12 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool in_panic_state(void) +{ + return (atomic_read(_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(in_panic_state); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in -- 2.27.0
Re: [PATCH 2/4] hung_task: Replace "did_panic" with is_be_panic()
On 2020/12/19 1:06, Randy Dunlap wrote: On 12/18/20 6:36 AM, Tetsuo Handa wrote: On 2020/12/18 21:59, Pavel Machek wrote: On Fri 2020-12-18 19:44:04, Xiaoming Ni wrote: Plus.. is_being_panic is not really english. "is_paniccing" would be closer...? Or in_panic() ? Yes, or in_panic_state() Thank you, I'll resend the patch later on according to your suggestion. Thanks Xiaoming Ni .
[PATCH 3/4] leds:trigger:ledtrig-activity Replace "panic_detected" with is_be_panic()
Replace the global variable "panic_detected" with is_be_panic() Signed-off-by: Xiaoming Ni --- drivers/leds/trigger/ledtrig-activity.c | 19 +-- 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c index 14ba7faaed9e..bbacb3dbe341 100644 --- a/drivers/leds/trigger/ledtrig-activity.c +++ b/drivers/leds/trigger/ledtrig-activity.c @@ -17,8 +17,6 @@ #include #include "../leds.h" -static int panic_detected; - struct activity_data { struct timer_list timer; struct led_classdev *led_cdev; @@ -47,7 +45,7 @@ static void led_activity_function(struct timer_list *t) if (test_and_clear_bit(LED_BLINK_BRIGHTNESS_CHANGE, _cdev->work_flags)) led_cdev->blink_brightness = led_cdev->new_blink_brightness; - if (unlikely(panic_detected)) { + if (unlikely(is_being_panic())) { /* full brightness in case of panic */ led_set_brightness_nosleep(led_cdev, led_cdev->blink_brightness); return; @@ -226,28 +224,15 @@ static int activity_reboot_notifier(struct notifier_block *nb, return NOTIFY_DONE; } -static int activity_panic_notifier(struct notifier_block *nb, - unsigned long code, void *unused) -{ - panic_detected = 1; - return NOTIFY_DONE; -} - static struct notifier_block activity_reboot_nb = { .notifier_call = activity_reboot_notifier, }; -static struct notifier_block activity_panic_nb = { - .notifier_call = activity_panic_notifier, -}; - static int __init activity_init(void) { int rc = led_trigger_register(_led_trigger); if (!rc) { - atomic_notifier_chain_register(_notifier_list, - _panic_nb); register_reboot_notifier(_reboot_nb); } return rc; @@ -256,8 +241,6 @@ static int __init activity_init(void) static void __exit activity_exit(void) { unregister_reboot_notifier(_reboot_nb); - atomic_notifier_chain_unregister(_notifier_list, -_panic_nb); led_trigger_unregister(_led_trigger); } -- 2.27.0
[PATCH 0/4] Add new API is_being_panic()
Add is_being_panic() to check whether the system is in panic state. Used to replace the global variable used to determine the panic status in other features: hung_task ledtrig-activity ledtrig-heartbeat Xiaoming Ni (4): panic: Add new API is_being_panic() hung_task: Replace "did_panic" with is_be_panic() leds:trigger:ledtrig-activity Replace "panic_detected" with is_be_panic() leds:trigger:ledtrig-heartbeat: Replace "panic_heartbeats" with is_be_panic() drivers/leds/trigger/ledtrig-activity.c | 19 +-- drivers/leds/trigger/ledtrig-heartbeat.c | 19 +-- include/linux/kernel.h | 1 + kernel/hung_task.c | 17 + kernel/panic.c | 6 ++ 5 files changed, 10 insertions(+), 52 deletions(-) -- 2.27.0
[PATCH 2/4] hung_task: Replace "did_panic" with is_be_panic()
Replace the global variable "did_panic" with is_be_panic() Signed-off-by: Xiaoming Ni --- kernel/hung_task.c | 17 + 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bb2e3e15c84c..3374b993da4c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -50,7 +50,6 @@ unsigned long __read_mostly sysctl_hung_task_check_interval_secs; int __read_mostly sysctl_hung_task_warnings = 10; -static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; static bool hung_task_show_all_bt; @@ -72,18 +71,6 @@ unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int -hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = hung_task_panic, -}; - static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -223,7 +210,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if (test_taint(TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || unlikely(is_being_panic())) return; hung_task_show_lock = false; @@ -347,8 +334,6 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { - atomic_notifier_chain_register(_notifier_list, _block); - /* Disable hung task detector on suspend */ pm_notifier(hungtask_pm_notify, 0); -- 2.27.0
[PATCH 4/4] leds:trigger:ledtrig-heartbeat: Replace "panic_heartbeats" with is_be_panic()
Replace the global variable "panic_heartbeats" with is_be_panic() Signed-off-by: Xiaoming Ni --- drivers/leds/trigger/ledtrig-heartbeat.c | 19 +-- 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c index 36b6709afe9f..3c34d49f0ed8 100644 --- a/drivers/leds/trigger/ledtrig-heartbeat.c +++ b/drivers/leds/trigger/ledtrig-heartbeat.c @@ -19,8 +19,6 @@ #include #include "../leds.h" -static int panic_heartbeats; - struct heartbeat_trig_data { struct led_classdev *led_cdev; unsigned int phase; @@ -39,7 +37,7 @@ static void led_heartbeat_function(struct timer_list *t) led_cdev = heartbeat_data->led_cdev; - if (unlikely(panic_heartbeats)) { + if (unlikely(is_being_panic())) { led_set_brightness_nosleep(led_cdev, LED_OFF); return; } @@ -169,28 +167,15 @@ static int heartbeat_reboot_notifier(struct notifier_block *nb, return NOTIFY_DONE; } -static int heartbeat_panic_notifier(struct notifier_block *nb, -unsigned long code, void *unused) -{ - panic_heartbeats = 1; - return NOTIFY_DONE; -} - static struct notifier_block heartbeat_reboot_nb = { .notifier_call = heartbeat_reboot_notifier, }; -static struct notifier_block heartbeat_panic_nb = { - .notifier_call = heartbeat_panic_notifier, -}; - static int __init heartbeat_trig_init(void) { int rc = led_trigger_register(_led_trigger); if (!rc) { - atomic_notifier_chain_register(_notifier_list, - _panic_nb); register_reboot_notifier(_reboot_nb); } return rc; @@ -199,8 +184,6 @@ static int __init heartbeat_trig_init(void) static void __exit heartbeat_trig_exit(void) { unregister_reboot_notifier(_reboot_nb); - atomic_notifier_chain_unregister(_notifier_list, -_panic_nb); led_trigger_unregister(_led_trigger); } -- 2.27.0
[PATCH 1/4] panic: Add new API is_being_panic()
Add is_being_panic() to check whether the system is in panic state. Used to replace the global variable used to determine the panic status in other features: hung_task ledtrig-activity ledtrig-heartbeat Signed-off-by: Xiaoming Ni --- include/linux/kernel.h | 1 + kernel/panic.c | 6 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7902d8c1048..3d6f344771c1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -167,6 +167,7 @@ void __might_fault(const char *file, int line); static inline void might_fault(void) { } #endif +extern bool is_being_panic(void); extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) diff --git a/kernel/panic.c b/kernel/panic.c index 332736a72a58..3b6a487702b0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -125,6 +125,12 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool is_being_panic(void) +{ + return (atomic_read(_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(is_being_panic); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in -- 2.27.0
[PATCH] dma/qcom/gpi: Fixes a format mismatch
drivers/dma/qcom/gpi.c:1419:3: warning: format '%lu' expects argument of type 'long unsigned int', but argument 8 has type 'size_t {aka unsigned int}' [-Wformat=] drivers/dma/qcom/gpi.c:1427:31: warning: format '%lu' expects argument of type 'long unsigned int', but argument 3 has type 'size_t {aka unsigned int}' [-Wformat=] drivers/dma/qcom/gpi.c:1447:3: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 4 has type 'dma_addr_t {aka unsigned int}' [-Wformat=] drivers/dma/qcom/gpi.c:1447:3: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type 'phys_addr_t {aka unsigned int}' [-Wformat=] Signed-off-by: Xiaoming Ni --- drivers/dma/qcom/gpi.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c index d2334f535de2..556c070a514c 100644 --- a/drivers/dma/qcom/gpi.c +++ b/drivers/dma/qcom/gpi.c @@ -1416,7 +1416,7 @@ static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, len = 1 << bit; ring->alloc_size = (len + (len - 1)); dev_dbg(gpii->gpi_dev->dev, - "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%lu\n", + "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%zu\n", elements, el_size, (elements * el_size), len, ring->alloc_size); @@ -1424,7 +1424,7 @@ static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, ring->alloc_size, >dma_handle, GFP_KERNEL); if (!ring->pre_aligned) { - dev_err(gpii->gpi_dev->dev, "could not alloc size:%lu mem for ring\n", + dev_err(gpii->gpi_dev->dev, "could not alloc size:%zu mem for ring\n", ring->alloc_size); return -ENOMEM; } @@ -1444,8 +1444,8 @@ static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, smp_wmb(); dev_dbg(gpii->gpi_dev->dev, - "phy_pre:0x%0llx phy_alig:0x%0llx len:%u el_size:%u elements:%u\n", - ring->dma_handle, ring->phys_addr, ring->len, + "phy_pre:%pad phy_alig:%pa len:%u el_size:%u elements:%u\n", + >dma_handle, >phys_addr, ring->len, ring->el_size, ring->elements); return 0; -- 2.27.0
Re: ping // [PATCH] mtd:cfi_cmdset_0002: fix atomic sleep bug when CONFIG_MTD_XIP=y
On 2020/12/8 2:59, Vignesh Raghavendra wrote: Hi Xiaoming, On 12/7/20 4:23 PM, Miquel Raynal wrote: Hi Xiaoming, Xiaoming Ni wrote on Mon, 7 Dec 2020 18:48:33 +0800: ping On 2020/11/27 21:07, Xiaoming Ni wrote: When CONFIG_MTD_XIP=y, local_irq_disable() is called in xip_disable(). To avoid sleep in interrupt context, we need to call local_irq_enable() before schedule(). The problem call stack is as follows: bug1: do_write_oneword_retry() xip_disable() local_irq_disable() do_write_oneword_once() schedule() bug2: do_write_buffer() xip_disable() local_irq_disable() do_write_buffer_wait() schedule() bug3: do_erase_chip() xip_disable() local_irq_disable() schedule() bug4: do_erase_oneblock() xip_disable() local_irq_disable() schedule() Fixes: 02b15e343aee ("[MTD] XIP for AMD CFI flash.") Cc: sta...@vger.kernel.org # v2.6.13 Signed-off-by: Xiaoming Ni --- drivers/mtd/chips/cfi_cmdset_0002.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index a1f3e1031c3d..12c3776f093a 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -1682,7 +1682,11 @@ static int __xipram do_write_oneword_once(struct map_info *map, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); The fix really seems strange to me. I will let Vignesh decide but I think we should consider updating/fixing xip_disable instead. Agree with Miquel. Have you done any testing or is this purely based on code inspection? I don't have the corresponding device test environment. I found the problem through code review. What about comment before xip_disable() function: /* * No interrupt what so ever can be serviced while the flash isn't in array * mode. This is ensured by the xip_disable() and xip_enable() functions * enclosing any code path where the flash is known not to be in array mode. * And within a XIP disabled code path, only functions marked with __xipram * may be called and nothing else (it's a good thing to inspect generated * assembly to make sure inline functions were actually inlined and that gcc * didn't emit calls to its own support functions). Also configuring MTD CFI * support to a single buswidth and a single interleave is also recommended. */ So, I don't think the fix is as simple as this patch. +xip_enable(); schedule(); +xip_disable(); Do I need to change it to this? Regards Vignesh . Thanks Xiaoming Ni
ping // [PATCH] mtd:cfi_cmdset_0002: fix atomic sleep bug when CONFIG_MTD_XIP=y
ping On 2020/11/27 21:07, Xiaoming Ni wrote: When CONFIG_MTD_XIP=y, local_irq_disable() is called in xip_disable(). To avoid sleep in interrupt context, we need to call local_irq_enable() before schedule(). The problem call stack is as follows: bug1: do_write_oneword_retry() xip_disable() local_irq_disable() do_write_oneword_once() schedule() bug2: do_write_buffer() xip_disable() local_irq_disable() do_write_buffer_wait() schedule() bug3: do_erase_chip() xip_disable() local_irq_disable() schedule() bug4: do_erase_oneblock() xip_disable() local_irq_disable() schedule() Fixes: 02b15e343aee ("[MTD] XIP for AMD CFI flash.") Cc: sta...@vger.kernel.org # v2.6.13 Signed-off-by: Xiaoming Ni --- drivers/mtd/chips/cfi_cmdset_0002.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index a1f3e1031c3d..12c3776f093a 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -1682,7 +1682,11 @@ static int __xipram do_write_oneword_once(struct map_info *map, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); timeo = jiffies + (HZ / 2); /* FIXME */ mutex_lock(>mutex); @@ -1962,7 +1966,11 @@ static int __xipram do_write_buffer_wait(struct map_info *map, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); timeo = jiffies + (HZ / 2); /* FIXME */ mutex_lock(>mutex); @@ -2461,7 +2469,11 @@ static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip) set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); mutex_lock(>mutex); continue; @@ -2560,7 +2572,11 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); mutex_lock(>mutex); continue;
[PATCH] mtd:cfi_cmdset_0002: fix atomic sleep bug when CONFIG_MTD_XIP=y
When CONFIG_MTD_XIP=y, local_irq_disable() is called in xip_disable(). To avoid sleep in interrupt context, we need to call local_irq_enable() before schedule(). The problem call stack is as follows: bug1: do_write_oneword_retry() xip_disable() local_irq_disable() do_write_oneword_once() schedule() bug2: do_write_buffer() xip_disable() local_irq_disable() do_write_buffer_wait() schedule() bug3: do_erase_chip() xip_disable() local_irq_disable() schedule() bug4: do_erase_oneblock() xip_disable() local_irq_disable() schedule() Fixes: 02b15e343aee ("[MTD] XIP for AMD CFI flash.") Cc: sta...@vger.kernel.org # v2.6.13 Signed-off-by: Xiaoming Ni --- drivers/mtd/chips/cfi_cmdset_0002.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index a1f3e1031c3d..12c3776f093a 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -1682,7 +1682,11 @@ static int __xipram do_write_oneword_once(struct map_info *map, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); timeo = jiffies + (HZ / 2); /* FIXME */ mutex_lock(>mutex); @@ -1962,7 +1966,11 @@ static int __xipram do_write_buffer_wait(struct map_info *map, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); timeo = jiffies + (HZ / 2); /* FIXME */ mutex_lock(>mutex); @@ -2461,7 +2469,11 @@ static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip) set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); mutex_lock(>mutex); continue; @@ -2560,7 +2572,11 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip, set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(>wq, ); mutex_unlock(>mutex); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_enable(); schedule(); + if (IS_ENABLED(CONFIG_MTD_XIP)) + local_irq_disable(); remove_wait_queue(>wq, ); mutex_lock(>mutex); continue; -- 2.27.0
ping //Re: [PATCH v2] arm:traps: Don't print stack or raw PC/LR hex values in backtraces
ping On 2020/10/16 10:31, Xiaoming Ni wrote: Printing raw pointer values in backtraces has potential security implications and are of questionable value anyway. This patch follows x86 and arm64's lead and removes the "Exception stack:" dump from kernel backtraces: commit a25ffd3a6302a6 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") commit bb5e5ce545f203 ("x86/dumpstack: Remove kernel text addresses from stack dump") Signed-off-by: Xiaoming Ni --- v2: Delete [] from the stack according to the email discussion in patch V1, Other information processing will be discussed in subsequent patches. v1: https://lore.kernel.org/lkml/20201009075957.110017-1-nixiaom...@huawei.com/ 1. Don't print stack or raw PC/LR hex values in backtraces 2. Don't print stack mem in backtraces 3. if (!panic_on_oops), Don't print stack mem in __die() --- arch/arm/kernel/process.c | 3 +-- arch/arm/kernel/traps.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8e6ace03e960..71c9e5597d39 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -121,8 +121,7 @@ void __show_regs(struct pt_regs *regs) printk("PC is at %pS\n", (void *)instruction_pointer(regs)); printk("LR is at %pS\n", (void *)regs->ARM_lr); - printk("pc : [<%08lx>]lr : [<%08lx>]psr: %08lx\n", - regs->ARM_pc, regs->ARM_lr, regs->ARM_cpsr); + printk("psr: %08lx\n", regs->ARM_cpsr); printk("sp : %08lx ip : %08lx fp : %08lx\n", regs->ARM_sp, regs->ARM_ip, regs->ARM_fp); printk("r10: %08lx r9 : %08lx r8 : %08lx\n", diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 17d5a785df28..911bbf164875 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -68,8 +68,8 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", - loglvl, where, (void *)where, from, (void *)from); + printk("%s (%ps) from (%pS)\n", + loglvl, (void *)where, (void *)from); #else printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", loglvl, where, from);
[PATCH] arm:traps:Don't dump the memory in non-system reset scenarios
Do not dump the memory in non-system reset scenarios to prevent virtual address information leakage. This patch follows x86 and arm64's lead and removes the "Exception stack:" dump from kernel backtraces: commit a25ffd3a6302a6 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") commit 0ee1dd9f5e7eae ("x86/dumpstack: Remove raw stack dump") Signed-off-by: Xiaoming Ni --- arch/arm/kernel/traps.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 911bbf164875..34e268378972 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -60,13 +60,9 @@ static int __init user_debug_setup(char *str) __setup("user_debug=", user_debug_setup); #endif -static void dump_mem(const char *, const char *, unsigned long, unsigned long); - void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame, const char *loglvl) { - unsigned long end = frame + 4 + sizeof(struct pt_regs); - #ifdef CONFIG_KALLSYMS printk("%s (%ps) from (%pS)\n", loglvl, (void *)where, (void *)from); @@ -74,9 +70,6 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", loglvl, where, from); #endif - - if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE)) - dump_mem(loglvl, "Exception stack", frame + 4, end); } void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) @@ -125,6 +118,12 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, mm_segment_t fs; int i; + /* +* To prevent virtual address information leakage, memory +* information cannot be printed in non-reset scenarios. +*/ + if (panic_on_oops == 0) + return; /* * We need to switch to kernel mode so that we can use __get_user * to safely read from kernel space. Note that we now dump the -- 2.27.0
[PATCH v2] arm:traps: Don't print stack or raw PC/LR hex values in backtraces
Printing raw pointer values in backtraces has potential security implications and are of questionable value anyway. This patch follows x86 and arm64's lead and removes the "Exception stack:" dump from kernel backtraces: commit a25ffd3a6302a6 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") commit bb5e5ce545f203 ("x86/dumpstack: Remove kernel text addresses from stack dump") Signed-off-by: Xiaoming Ni --- v2: Delete [] from the stack according to the email discussion in patch V1, Other information processing will be discussed in subsequent patches. v1: https://lore.kernel.org/lkml/20201009075957.110017-1-nixiaom...@huawei.com/ 1. Don't print stack or raw PC/LR hex values in backtraces 2. Don't print stack mem in backtraces 3. if (!panic_on_oops), Don't print stack mem in __die() --- arch/arm/kernel/process.c | 3 +-- arch/arm/kernel/traps.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8e6ace03e960..71c9e5597d39 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -121,8 +121,7 @@ void __show_regs(struct pt_regs *regs) printk("PC is at %pS\n", (void *)instruction_pointer(regs)); printk("LR is at %pS\n", (void *)regs->ARM_lr); - printk("pc : [<%08lx>]lr : [<%08lx>]psr: %08lx\n", - regs->ARM_pc, regs->ARM_lr, regs->ARM_cpsr); + printk("psr: %08lx\n", regs->ARM_cpsr); printk("sp : %08lx ip : %08lx fp : %08lx\n", regs->ARM_sp, regs->ARM_ip, regs->ARM_fp); printk("r10: %08lx r9 : %08lx r8 : %08lx\n", diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 17d5a785df28..911bbf164875 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -68,8 +68,8 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", - loglvl, where, (void *)where, from, (void *)from); + printk("%s (%ps) from (%pS)\n", + loglvl, (void *)where, (void *)from); #else printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", loglvl, where, from); -- 2.27.0
Re: [PATCH] arm:traps: Don't print stack or raw PC/LR values in backtraces
On 2020/10/12 5:32, Russell King - ARM Linux admin wrote: On Fri, Oct 09, 2020 at 10:18:20AM +0200, Sebastian Andrzej Siewior wrote: On 2020-10-09 09:08:50 [+0100], Russell King - ARM Linux admin wrote: I am really not happy about this - it hurts at least my ability to debug the kernel when people post oopses to the mailing list. If people wish to make the kernel harder to debug, and are prepared to be told "your kernel is undebuggable" then this patch is fine. I haven't look at the patch but don't they keep/add the representation: PC: symbol+offset/size LR: symbol+offset/size ? This is needed at very least as a replacement for the missing address. I don't have a problem getting rid of the hex numbers in [< >] although then I will need to convert the symbol back to an address using the vmlinux to then calculate its address to then find the appropriate place in the objdump output - because objdump does _not_ use the symbol+offset annotation. Yes, I really do look up the numeric addresses in the objdump output to then read the disassembly. $ objdump -d vmlinux | less and then search for the address is the fastest and most convenient way for me rather than having to deal with some random script. Maybe I'm just antequated about how I do my debugging, but this seems to me to be the most efficient and fastest way. The loading address of the kernel module is not fixed, so symbol+offset is more useful than a hexadecimal address when the call stack contains kernel module symbols. Delete the PC/LR address and retain the sysbol+offset. The kernel can still be debugged. Thanks Xiaoming Ni
Re: [PATCH] arm:traps: Don't print stack or raw PC/LR values in backtraces
On 2020/10/9 16:18, Sebastian Andrzej Siewior wrote: On 2020-10-09 09:08:50 [+0100], Russell King - ARM Linux admin wrote: I am really not happy about this - it hurts at least my ability to debug the kernel when people post oopses to the mailing list. If In the reset scenario, dump_mem is retained: @@ -125,6 +118,9 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, mm_segment_t fs; int i; + /* Do not print virtual addresses in non-reset scenarios */ + if (!panic_on_oops) + return; people wish to make the kernel harder to debug, and are prepared to be told "your kernel is undebuggable" then this patch is fine. I haven't look at the patch but don't they keep/add the representation: PC: symbol+offset/size LR: symbol+offset/size ? This is needed at very least as a replacement for the missing address. Yes, only %08lx was deleted, but %ps is still retained. - printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", - loglvl, where, (void *)where, from, (void *)from); + printk("%s (%ps) from (%pS)\n", + loglvl, (void *)where, (void *)from); Thanks Xiaoming Ni
[PATCH] efi:mokvar-table: fix build error
drivers/firmware/efi/mokvar-table.c:139:5: error: implicit declaration of function 'early_memunmap'; did you mean 'devm_memunmap'? [-Werror=implicit-function-declaration] drivers/firmware/efi/mokvar-table.c:148:9: error: implicit declaration of function 'early_memremap'; did you mean 'devm_memremap'? [-Werror=implicit-function-declaration] add #include to fix this build error Signed-off-by: Xiaoming Ni --- drivers/firmware/efi/mokvar-table.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/efi/mokvar-table.c b/drivers/firmware/efi/mokvar-table.c index 72a9e1736fef..c23d3686f1c7 100644 --- a/drivers/firmware/efi/mokvar-table.c +++ b/drivers/firmware/efi/mokvar-table.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * The LINUX_EFI_MOK_VARIABLE_TABLE_GUID config table is a packed -- 2.27.0
[PATCH] arm:traps: Don't print stack or raw PC/LR values in backtraces
Printing raw pointer values in backtraces has potential security implications and are of questionable value anyway. This patch follows x86 and arm64's lead and removes the "Exception stack:" dump from kernel backtraces: commit a25ffd3a6302a6 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") commit 0ee1dd9f5e7eae ("x86/dumpstack: Remove raw stack dump") commit bb5e5ce545f203 ("x86/dumpstack: Remove kernel text addresses from stack dump") Signed-off-by: Xiaoming Ni --- arch/arm/kernel/process.c | 3 +-- arch/arm/kernel/traps.c | 12 +--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8e6ace03e960..71c9e5597d39 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -121,8 +121,7 @@ void __show_regs(struct pt_regs *regs) printk("PC is at %pS\n", (void *)instruction_pointer(regs)); printk("LR is at %pS\n", (void *)regs->ARM_lr); - printk("pc : [<%08lx>]lr : [<%08lx>]psr: %08lx\n", - regs->ARM_pc, regs->ARM_lr, regs->ARM_cpsr); + printk("psr: %08lx\n", regs->ARM_cpsr); printk("sp : %08lx ip : %08lx fp : %08lx\n", regs->ARM_sp, regs->ARM_ip, regs->ARM_fp); printk("r10: %08lx r9 : %08lx r8 : %08lx\n", diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 17d5a785df28..b0b188e01070 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -60,23 +60,18 @@ static int __init user_debug_setup(char *str) __setup("user_debug=", user_debug_setup); #endif -static void dump_mem(const char *, const char *, unsigned long, unsigned long); - void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame, const char *loglvl) { unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", - loglvl, where, (void *)where, from, (void *)from); + printk("%s (%ps) from (%pS)\n", + loglvl, (void *)where, (void *)from); #else printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", loglvl, where, from); #endif - - if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE)) - dump_mem(loglvl, "Exception stack", frame + 4, end); } void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) @@ -125,6 +120,9 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, mm_segment_t fs; int i; + /* Do not print virtual addresses in non-reset scenarios */ + if (!panic_on_oops) + return; /* * We need to switch to kernel mode so that we can use __get_user * to safely read from kernel space. Note that we now dump the -- 2.27.0
Re: Question: Why is there no notification when a file is opened using filp_open()?
On 2020/9/9 11:44, Amir Goldstein wrote: On Tue, Sep 8, 2020 at 8:19 PM Matthew Wilcox wrote: On Tue, Sep 08, 2020 at 04:18:29PM +0300, Amir Goldstein wrote: On Tue, Sep 8, 2020 at 3:53 PM Xiaoming Ni wrote: For example, in fs/coredump.c, do_coredump() calls filp_open() to generate core files. In this scenario, the fsnotify_open() notification is missing. I am not convinced that we should generate an event. You will have to explain in what is the real world use case that requires this event to be generated. Take the typical usage for fsnotify of a graphical file manager. It would be nice if the file manager showed a corefile as soon as it appeared in a directory rather than waiting until some other operation in that directory caused those directory contents to be refreshed. fsnotify_open() is not the correct notification for file managers IMO. fsnotify_create() is and it will be called in this case. If the reason you are interested in open events is because you want to monitor the entire filesystem then welcome to the future - FAN_CREATE is supported since kernel v5.1. Is there another real life case you have in mind where you think users should be able to get an open fd for a file that the kernel has opened? Because that is what FAN_OPEN will do. There are also cases where file is opened in read-only mode using filp_open(). case1: nfsd4_init_recdir() call filp_open() filp_open() nfsd4_init_recdir() fs/nfsd/nfs4recover.c#L543 L70: static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; L543: nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); case2: ima_read_policy() filp_open() kernel_read_file_from_path() fs/exec.c#L1004 ima_read_policy() security/integrity/ima/ima_fs.c#L286 ima_write_policy() security/integrity/ima/ima_fs.c#L335 ima_measure_policy_ops security/integrity/ima/ima_fs.c#L443 sys_write() case3: use do_file_open_root() to open file do_file_open_root() file_open_root() fs/open.c#L1159 kernel_read_file_from_path_initns() fs/exec.c#L1029 fw_get_filesystem_firmware() drivers/base/firmware_loader/main.c#L498 Do we need to add fsnotify_open() in these scenarios? Thanks Xiaoming Ni
Re: Question: Why is there no notification when a file is opened using filp_open()?
On 2020/9/8 18:06, Amir Goldstein wrote: On Tue, Sep 8, 2020 at 11:02 AM Xiaoming Ni wrote: The file opening action on the system may be from user-mode sys_open() or kernel-mode filp_open(). Currently, fsnotify_open() is invoked in do_sys_openat2(). But filp_open() is not notified. Why? Is this an omission? Do we need to call fsnotify_open() in filp_open() or do_filp_open() to ensure that both user-mode and kernel-mode file opening operations can be notified? Do you have a specific use case of kernel filp_open() in mind? For example, in fs/coredump.c, do_coredump() calls filp_open() to generate core files. In this scenario, the fsnotify_open() notification is missing. Thanks Xiaoming Ni
Question: Why is there no notification when a file is opened using filp_open()?
The file opening action on the system may be from user-mode sys_open() or kernel-mode filp_open(). Currently, fsnotify_open() is invoked in do_sys_openat2(). But filp_open() is not notified. Why? Is this an omission? Do we need to call fsnotify_open() in filp_open() or do_filp_open() to ensure that both user-mode and kernel-mode file opening operations can be notified? Thanks Xiaoming Ni
ping//Re: [PATCH] security: fix some spelling mistakes in the comments by codespell
ping On 2020/8/22 11:05, Xiaoming Ni wrote: ecurity/commoncap.c: capabilties ==> capabilities security/lsm_audit.c: programers ==> programmers security/tomoyo/audit.c: stuct ==> struct security/tomoyo/common.c: Poiter ==> Pointer security/smack/smack_lsm.c: overriden ==> overridden overridden security/smack/smackfs.c: overriden ==> overridden security/integrity/ima/ima_template_lib.c: algoritm ==> algorithm Signed-off-by: Xiaoming Ni --- security/commoncap.c | 2 +- security/integrity/ima/ima_template_lib.c | 2 +- security/lsm_audit.c | 2 +- security/smack/smack_lsm.c| 2 +- security/smack/smackfs.c | 2 +- security/tomoyo/audit.c | 2 +- security/tomoyo/common.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 59bf3c1674c8..2c3a0f1b6876 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1046,7 +1046,7 @@ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) break; case LSM_SETID_FS: - /* juggle the capabilties to follow FSUID changes, unless + /* juggle the capabilities to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index c022ee9e2a4e..9513564ee0b2 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -231,7 +231,7 @@ static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: [] + ':' + '\0' + digest, -*where is provided if the hash algoritm is not +*where is provided if the hash algorithm is not *SHA1 or MD5 */ u8 buffer[CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 53d0d183db8f..a0ff2e441069 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -212,7 +212,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, char comm[sizeof(current->comm)]; /* -* To keep stack sizes in check force programers to notice if they +* To keep stack sizes in check force programmers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8c0893eb5aa8..960d4641239c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1784,7 +1784,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, */ file = container_of(fown, struct file, f_owner); - /* we don't log here as rc can be overriden */ + /* we don't log here as rc can be overridden */ blob = smack_file(file); skp = *blob; rc = smk_access(skp, tkp, MAY_DELIVER, NULL); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 9c4308077574..5c581ec814ac 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -111,7 +111,7 @@ struct smack_known *smack_syslog_label; /* * Ptrace current rule * SMACK_PTRACE_DEFAULTregular smack ptrace rules (/proc based) - * SMACK_PTRACE_EXACT labels must match, but can be overriden with + * SMACK_PTRACE_EXACT labels must match, but can be overridden with * CAP_SYS_PTRACE * SMACK_PTRACE_DRACONIAN lables must match, CAP_SYS_PTRACE has no effect */ diff --git a/security/tomoyo/audit.c b/security/tomoyo/audit.c index 3c96e8402e94..b51bad121c11 100644 --- a/security/tomoyo/audit.c +++ b/security/tomoyo/audit.c @@ -311,7 +311,7 @@ static LIST_HEAD(tomoyo_log); /* Lock for "struct list_head tomoyo_log". */ static DEFINE_SPINLOCK(tomoyo_log_lock); -/* Length of "stuct list_head tomoyo_log". */ +/* Length of "struct list_head tomoyo_log". */ static unsigned int tomoyo_log_count; /** diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index 4bee32bfe16d..38bdc0ffc312 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2608,7 +2608,7 @@ ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, /** * tomoyo_parse_policy - Parse a policy line. * - * @head: Poiter to "struct tomoyo_io_buffer". + * @head: Pointer to "struct tomoyo_io_buffer". * @line: Line to parse. * * Returns 0 on success, negative value otherwise.
[PATCH] arm64: fix some spelling mistakes in the comments by codespell
arch/arm64/include/asm/cpu_ops.h:24: necesary ==> necessary arch/arm64/include/asm/kvm_arm.h:69: maintainance ==> maintenance arch/arm64/include/asm/cpufeature.h:361: capabilties ==> capabilities arch/arm64/kernel/perf_regs.c:19: compatability ==> compatibility arch/arm64/kernel/smp_spin_table.c:86: endianess ==> endianness arch/arm64/kernel/smp_spin_table.c:88: endianess ==> endianness arch/arm64/kvm/vgic/vgic-mmio-v3.c:1004: targetting ==> targeting arch/arm64/kvm/vgic/vgic-mmio-v3.c:1005: targetting ==> targeting Signed-off-by: Xiaoming Ni --- arch/arm64/include/asm/cpu_ops.h| 2 +- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/kvm_arm.h| 2 +- arch/arm64/kernel/perf_regs.c | 2 +- arch/arm64/kernel/smp_spin_table.c | 4 ++-- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h index d28e8f37d3b4..e95c4df83911 100644 --- a/arch/arm64/include/asm/cpu_ops.h +++ b/arch/arm64/include/asm/cpu_ops.h @@ -21,7 +21,7 @@ * mechanism for doing so, tests whether it is possible to boot * the given CPU. * @cpu_boot: Boots a cpu into the kernel. - * @cpu_postboot: Optionally, perform any post-boot cleanup or necesary + * @cpu_postboot: Optionally, perform any post-boot cleanup or necessary * synchronisation. Called from the cpu being booted. * @cpu_can_disable: Determines whether a CPU can be disabled based on * mechanism-specific information. diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 89b4f0142c28..3a42dc8e697c 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -358,7 +358,7 @@ static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap) } /* - * Generic helper for handling capabilties with multiple (match,enable) pairs + * Generic helper for handling capabilities with multiple (match,enable) pairs * of call backs, sharing the same capability bit. * Iterate over each entry to see if at least one matches. */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 51c1d9918999..21f91aebc052 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -66,7 +66,7 @@ * TWI:Trap WFI * TIDCP: Trap L2CTLR/L2ECTLR * BSU_IS: Upgrade barriers to the inner shareable domain - * FB: Force broadcast of all maintainance operations + * FB: Force broadcast of all maintenance operations * AMO:Override CPSR.A and enable signaling with VA * IMO:Override CPSR.I and enable signaling with VI * FMO:Override CPSR.F and enable signaling with VF diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 666b225aeb3a..94e8718e7229 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -16,7 +16,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) /* * Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but -* we're stuck with it for ABI compatability reasons. +* we're stuck with it for ABI compatibility reasons. * * For a 32-bit consumer inspecting a 32-bit task, then it will look at * the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h). diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index c8a3fee00c11..5892e79fa429 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -83,9 +83,9 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) /* * We write the release address as LE regardless of the native -* endianess of the kernel. Therefore, any boot-loaders that +* endianness of the kernel. Therefore, any boot-loaders that * read this address need to convert this address to the -* boot-loader's endianess before jumping. This is mandated by +* boot-loader's endianness before jumping. This is mandated by * the boot protocol. */ writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 5c786b915cd3..52d6f24f65dc 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -1001,8 +1001,8 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) raw_spin_lock_irqsave(>irq_lock, flags); /* -* An access targetting Group0 SGIs can only generate -* those, while an access targetting Group1 SGIs can +* An access targeting Group0 SGIs can only generate +* those, while an acces
[PATCH] security: fix some spelling mistakes in the comments by codespell
ecurity/commoncap.c: capabilties ==> capabilities security/lsm_audit.c: programers ==> programmers security/tomoyo/audit.c: stuct ==> struct security/tomoyo/common.c: Poiter ==> Pointer security/smack/smack_lsm.c: overriden ==> overridden overridden security/smack/smackfs.c: overriden ==> overridden security/integrity/ima/ima_template_lib.c: algoritm ==> algorithm Signed-off-by: Xiaoming Ni --- security/commoncap.c | 2 +- security/integrity/ima/ima_template_lib.c | 2 +- security/lsm_audit.c | 2 +- security/smack/smack_lsm.c| 2 +- security/smack/smackfs.c | 2 +- security/tomoyo/audit.c | 2 +- security/tomoyo/common.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 59bf3c1674c8..2c3a0f1b6876 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1046,7 +1046,7 @@ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) break; case LSM_SETID_FS: - /* juggle the capabilties to follow FSUID changes, unless + /* juggle the capabilities to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index c022ee9e2a4e..9513564ee0b2 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -231,7 +231,7 @@ static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: [] + ':' + '\0' + digest, -*where is provided if the hash algoritm is not +*where is provided if the hash algorithm is not *SHA1 or MD5 */ u8 buffer[CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 53d0d183db8f..a0ff2e441069 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -212,7 +212,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, char comm[sizeof(current->comm)]; /* -* To keep stack sizes in check force programers to notice if they +* To keep stack sizes in check force programmers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8c0893eb5aa8..960d4641239c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1784,7 +1784,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, */ file = container_of(fown, struct file, f_owner); - /* we don't log here as rc can be overriden */ + /* we don't log here as rc can be overridden */ blob = smack_file(file); skp = *blob; rc = smk_access(skp, tkp, MAY_DELIVER, NULL); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 9c4308077574..5c581ec814ac 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -111,7 +111,7 @@ struct smack_known *smack_syslog_label; /* * Ptrace current rule * SMACK_PTRACE_DEFAULTregular smack ptrace rules (/proc based) - * SMACK_PTRACE_EXACT labels must match, but can be overriden with + * SMACK_PTRACE_EXACT labels must match, but can be overridden with *CAP_SYS_PTRACE * SMACK_PTRACE_DRACONIAN lables must match, CAP_SYS_PTRACE has no effect */ diff --git a/security/tomoyo/audit.c b/security/tomoyo/audit.c index 3c96e8402e94..b51bad121c11 100644 --- a/security/tomoyo/audit.c +++ b/security/tomoyo/audit.c @@ -311,7 +311,7 @@ static LIST_HEAD(tomoyo_log); /* Lock for "struct list_head tomoyo_log". */ static DEFINE_SPINLOCK(tomoyo_log_lock); -/* Length of "stuct list_head tomoyo_log". */ +/* Length of "struct list_head tomoyo_log". */ static unsigned int tomoyo_log_count; /** diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index 4bee32bfe16d..38bdc0ffc312 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2608,7 +2608,7 @@ ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, /** * tomoyo_parse_policy - Parse a policy line. * - * @head: Poiter to "struct tomoyo_io_buffer". + * @head: Pointer to "struct tomoyo_io_buffer". * @line: Line to parse. * * Returns 0 on success, negative value otherwise. -- 2.27.0
[PATCH v2] s390: fix build error for sys_call_table_emu
Build error on s390: arch/s390/kernel/entry.o: in function `sys_call_table_emu': >> (.rodata+0x1288): undefined reference to `__s390_' In commit ("All arch: remove system call sys_sysctl") 148 commonfdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_sysctl compat_sys_sysctl +149 common_sysctl sys_ni_syscall 150 commonmlock sys_mlock sys_mlock After the patch is integrated, there is a format error in the generated arch/s390/include/generated/asm/syscall_table.h: SYSCALL(sys_fdatasync, sys_fdatasync) SYSCALL(sys_ni_syscall,) /* cause build error */ SYSCALL(sys_mlock,sys_mlock) According to the guidance of Heiko Carstens, use "-" to fill the empty system call Similarly, modify tools/perf/arch/s390/entry/syscalls/syscall.tbl. Fixes: ("All arch: remove system call sys_sysctl") Fixes: https://lore.kernel.org/linuxppc-dev/20200616030734.87257-1-nixiaom...@huawei.com/ Reported-by: kernel test robot Signed-off-by: Xiaoming Ni changes in v2: use "-" to fill the empty system call v1: https://lore.kernel.org/lkml/20200618110320.104013-1-nixiaom...@huawei.com/ --- arch/s390/kernel/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index f17aaf6fe5de..04c34c2ed916 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -138,7 +138,7 @@ 146 commonwritev sys_writev compat_sys_writev 147 commongetsid sys_getsid sys_getsid 148 commonfdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_ni_syscall +149 common_sysctl - - 150 commonmlock sys_mlock sys_mlock 151 commonmunlock sys_munlock sys_munlock 152 commonmlockallsys_mlockall sys_mlockall diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 0193f9b98753..29144b79a49d 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -138,7 +138,7 @@ 146 commonwritev sys_writev compat_sys_writev 147 commongetsid sys_getsid sys_getsid 148 commonfdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_ni_syscall +149 common_sysctl - - 150 commonmlock sys_mlock compat_sys_mlock 151 commonmunlock sys_munlock compat_sys_munlock 152 commonmlockallsys_mlockall sys_mlockall -- 2.27.0
Re: [PATCH] s390: fix build error for sys_call_table_emu
On 2020/6/18 19:27, Heiko Carstens wrote: On Thu, Jun 18, 2020 at 07:03:20PM +0800, Xiaoming Ni wrote: Build error on s390: arch/s390/kernel/entry.o: in function `sys_call_table_emu': >> (.rodata+0x1288): undefined reference to `__s390_' In commit ("All arch: remove system call sys_sysctl") 148 common fdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_sysctl compat_sys_sysctl +149 common_sysctl sys_ni_syscall 150 common mlock sys_mlock sys_mlock After the patch is integrated, there is a format error in the generated arch/s390/include/generated/asm/syscall_table.h: SYSCALL(sys_fdatasync, sys_fdatasync) SYSCALL(sys_ni_syscall,) /* cause build error */ SYSCALL(sys_mlock,sys_mlock) There are holes in the system call number in arch/s390/kernel/syscalls/syscall.tbl. When generating syscall_table.h, these hole numbers will be automatically filled with "NI_SYSCALL". Therefore, delete the number 149 to fix the current compilation failure. Similarly, modify tools/perf/arch/s390/entry/syscalls/syscall.tbl. Fixes: ("All arch: remove system call sys_sysctl") Fixes: https://lore.kernel.org/linuxppc-dev/20200616030734.87257-1-nixiaom...@huawei.com/ Reported-by: kernel test robot Signed-off-by: Xiaoming Ni --- arch/s390/kernel/syscalls/syscall.tbl | 1 - tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index f17aaf6fe5de..bcaf93994e3c 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -138,7 +138,6 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_ni_syscall This is not correct. It should be changed to: 149 common _sysctl - - thanks for your guidance Otherwise the generated __NR__sysctl define will be lost from unistd.h, which should not happen. Looking at the link above it _looks_ like a similar mistake was done for arm64. Using holes will cause the definition of __NR__sysctl to be missing in include/asm/unistd_32.h and include/asm/unistd_64.h For arm64, I observed that "sys_afs_syscall", "sys_get_kernel_syms" and other commented out syscalls have no corresponding definition _NR_XXX in unistd.h, is it not a problem on arm64? /* 127 was sys_create_module */ __SYSCALL(127, sys_ni_syscall) /* 130 was sys_get_kernel_syms */ __SYSCALL(130, sys_ni_syscall) /* 137 was sys_afs_syscall */ __SYSCALL(137, sys_ni_syscall) Thanks Xiaoming Ni
[PATCH] s390: fix build error for sys_call_table_emu
Build error on s390: arch/s390/kernel/entry.o: in function `sys_call_table_emu': >> (.rodata+0x1288): undefined reference to `__s390_' In commit ("All arch: remove system call sys_sysctl") 148 commonfdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_sysctl compat_sys_sysctl +149 common_sysctl sys_ni_syscall 150 commonmlock sys_mlock sys_mlock After the patch is integrated, there is a format error in the generated arch/s390/include/generated/asm/syscall_table.h: SYSCALL(sys_fdatasync, sys_fdatasync) SYSCALL(sys_ni_syscall,) /* cause build error */ SYSCALL(sys_mlock,sys_mlock) There are holes in the system call number in arch/s390/kernel/syscalls/syscall.tbl. When generating syscall_table.h, these hole numbers will be automatically filled with "NI_SYSCALL". Therefore, delete the number 149 to fix the current compilation failure. Similarly, modify tools/perf/arch/s390/entry/syscalls/syscall.tbl. Fixes: ("All arch: remove system call sys_sysctl") Fixes: https://lore.kernel.org/linuxppc-dev/20200616030734.87257-1-nixiaom...@huawei.com/ Reported-by: kernel test robot Signed-off-by: Xiaoming Ni --- arch/s390/kernel/syscalls/syscall.tbl | 1 - tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index f17aaf6fe5de..bcaf93994e3c 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -138,7 +138,6 @@ 146 commonwritev sys_writev compat_sys_writev 147 commongetsid sys_getsid sys_getsid 148 commonfdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_ni_syscall 150 commonmlock sys_mlock sys_mlock 151 commonmunlock sys_munlock sys_munlock 152 commonmlockallsys_mlockall sys_mlockall diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 0193f9b98753..eb77d0d01d8f 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -138,7 +138,6 @@ 146 commonwritev sys_writev compat_sys_writev 147 commongetsid sys_getsid sys_getsid 148 commonfdatasync sys_fdatasync sys_fdatasync -149 common_sysctl sys_ni_syscall 150 commonmlock sys_mlock compat_sys_mlock 151 commonmunlock sys_munlock compat_sys_munlock 152 commonmlockallsys_mlockall sys_mlockall -- 2.27.0
Re: [PATCH 0/3] Convert nsproxy, groups, and creds to refcount_t
On 2020/6/13 2:34, Kees Cook wrote: This series was never applied[1], and was recently pointed out as missing[2]. If someone has a tree for this, please take it. Otherwise, please Ack and I'll send it to Linus. Thanks! -Kees [1] https://lore.kernel.org/lkml/20190306110549.7628-1-elena.reshet...@intel.com/ [2] https://lore.kernel.org/lkml/1591957695-118312-1-git-send-email-nixiaom...@huawei.com/ Elena Reshetova (3): nsproxy: convert nsproxy.count to refcount_t groups: convert group_info.usage to refcount_t creds: convert cred.usage to refcount_t include/linux/cred.h| 15 +++--- include/linux/nsproxy.h | 7 +++ kernel/cred.c | 44 - kernel/groups.c | 2 +- kernel/nsproxy.c| 6 +++--- net/sunrpc/auth.c | 2 +- 6 files changed, 38 insertions(+), 38 deletions(-) Should mm->mm_users also be replaced by refcount_t? In addition, is it better to change all variables that use atomic_dec_and_test to control the release process to refconut_t? Thanks Xiaoming Ni
Re: [PATCH 3/3] creds: convert cred.usage to refcount_t
On 2020/6/13 2:34, Kees Cook wrote: From: Elena Reshetova atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cred.usage is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts.Please check Documentation/core-api/refcount-vs-atomic.rst for more information. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the cred.usage it might make a difference in following places: - get_task_cred(): increment in refcount_inc_not_zero() only guarantees control dependency on success vs. fully ordered atomic counterpart - put_cred(): decrement in refcount_dec_and_test() only provides RELEASE ordering and ACQUIRE ordering on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Link: https://lore.kernel.org/r/20190306110549.7628-4-elena.reshet...@intel.com Signed-off-by: Kees Cook Currently this patch is better than my RFC patch Looks good to me. Thanks Xiaoming Ni
[PATCH RFC] cred: Add WARN to detect wrong use of get/put_cred
Cred release and usage check code flow: 1. put_cred() if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); 2. __put_cred() BUG_ON(atomic_read(>usage) != 0); call_rcu(>rcu, put_cred_rcu); 3. put_cred_rcu() if (atomic_read(>usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %d\n", cred, atomic_read(>usage)); kmem_cache_free(cred_jar, cred); If panic is triggered on put_cred_rcu(), there are two possibilities 1. Call get_cred() after __put_cred(), usage > 0 2. Call put_cred() after __put_cred(), usage < 0 Since put_cred_rcu is an asynchronous behavior, it is no longer the first scene when panic, there is no information about the murderer in the panic call stack... So, add WARN() in get_cred()/put_cred(), and pray to catch the murderer at the first scene. Signed-off-by: Xiaoming Ni --- include/linux/cred.h | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/cred.h b/include/linux/cred.h index 18639c0..c00d5a1 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -224,11 +224,16 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) * * Get a reference on the specified set of new credentials. The caller must * release the reference. + * + * Initialize usage to 1 during cred resource allocation, + * so when calling get_cred, usage cannot be 0. */ static inline struct cred *get_new_cred(struct cred *cred) { - atomic_inc(>usage); - return cred; + if (atomic_inc_not_zero(>usage)) + return cred; + WARN(1, "get_new_cred after __put_cred"); + return NULL; } /** @@ -280,11 +285,14 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; + int usage; if (cred) { validate_creds(cred); - if (atomic_dec_and_test(&(cred)->usage)) + usage = atomic_dec_return(&(cred)->usage); + if (usage == 0) __put_cred(cred); + WARN(usage < 0, "put_cred after __put_cred"); } } -- 1.8.5.6
Re: [PATCH] sysctl: Delete the code of sys_sysctl
On 2020/6/10 3:20, Eric W. Biederman wrote: Xiaoming Ni writes: Since the commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"), sys_sysctl has lost its actual role: any input can only return an error. The remaining code does have a role. It reports programs that attempt to use the removed sysctl. It would help if your change description had a reason why we don't want to warn people that a program has used a removed system call. Probably something like: We have been warning about people using the sysctl system call for years and believe there are no more users. Even if there are users of this interface if they have not complained or fixed their code by now they probably are not going to, so there is no point in warning them any longer. With a change like that made to the patch description. Acked-by: "Eric W. Biederman" Thanks for your guidance I will add it in the v2 version Thanks Xiaoming Ni
Re: [PATCH] sysctl: Delete the code of sys_sysctl
On 2020/6/9 23:40, Kees Cook wrote: On Tue, Jun 09, 2020 at 02:20:05PM +0800, Xiaoming Ni wrote: Since the commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"), sys_sysctl has lost its actual role: any input can only return an error. Delete the code and return -ENOSYS directly at the function entry Signed-off-by: Xiaoming Ni Looks right to me. Reviewed-by: Kees Cook Should this be taken a step further and just remove the syscall entirely and update the per-arch tables with the ENOSYS hole? -Kees Searching for git log, I found a commit record that deleted syscall: commit f5b94099739722 ("All Arch: remove linkage for sys_nfsservctl system call"). Could I use sys_ni_syscall to implement the hole as in the example here? E.g: diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 7b3832d..f36fda6 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -162,7 +162,7 @@ 146common writev sys_writev 147common getsid sys_getsid 148common fdatasync sys_fdatasync -149common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150common mlock sys_mlock 151common munlock sys_munlock 152common mlockallsys_mlockall diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index f8dafe9..ca41bb7 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -308,8 +308,8 @@ __SYSCALL(__NR_getsid, sys_getsid) #define __NR_fdatasync 148 __SYSCALL(__NR_fdatasync, sys_fdatasync) -#define __NR__sysctl 149 -__SYSCALL(__NR__sysctl, compat_sys_sysctl) + /* 149 was sys_sysctl */ +__SYSCALL(149, sys_ni_syscall) #define __NR_mlock 150 __SYSCALL(__NR_mlock, sys_mlock) #define __NR_munlock 151 In this case, I need to modify a lot of code in v2. Can I add "Reviewed-by: Kees Cook " to the v2 patch? Thanks Xiaoming Ni
[PATCH] sysctl: Delete the code of sys_sysctl
Since the commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"), sys_sysctl has lost its actual role: any input can only return an error. Delete the code and return -ENOSYS directly at the function entry Signed-off-by: Xiaoming Ni --- kernel/sysctl_binary.c | 146 + 1 file changed, 2 insertions(+), 144 deletions(-) diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7d550cc..41a88f8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1,126 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* -* CTL_KERN/KERN_VERSION is used by older glibc and cannot -* ever go away. -*/ - if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk(KERN_CONT "%d.", name[i]); - printk(KERN_CONT "\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<nlen. */ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) { - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; + return -ENOSYS; } @@ -138,34 +23,7 @@ struct compat_sysctl_args { COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) { - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; + return -ENOSYS; } #endif /* CONFIG_COMPAT */ -- 1.8.5.6
[PATCH] ASoC: max98390: fix build warning detected by -Wformat
Fix build warning: sound/soc/codecs/max98390.c:781:3: warning: format '%ld' expects argument of type 'long int', but argument 4 has type 'size_t {aka const unsigned int}' [-Wformat=] Signed-off-by: Xiaoming Ni --- sound/soc/codecs/max98390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/max98390.c b/sound/soc/codecs/max98390.c index b9ce44d..884ee55 100644 --- a/sound/soc/codecs/max98390.c +++ b/sound/soc/codecs/max98390.c @@ -778,7 +778,7 @@ static int max98390_dsm_init(struct snd_soc_component *component) } dev_dbg(component->dev, - "max98390: param fw size %ld\n", + "max98390: param fw size %zu\n", fw->size); dsm_param = (char *)fw->data; dsm_param += MAX98390_DSM_PAYLOAD_OFFSET; -- 1.8.5.6
Re: [PATCH 13/13] fs: move binfmt_misc sysctl to its own file
On 2020/5/29 15:41, Luis Chamberlain wrote: This moves the binfmt_misc sysctl to its own file to help remove clutter from kernel/sysctl.c. Signed-off-by: Luis Chamberlain --- fs/binfmt_misc.c | 1 + kernel/sysctl.c | 7 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f69a043f562b..656b3f5f3bbf 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -821,6 +821,7 @@ static int __init init_misc_binfmt(void) int err = register_filesystem(_fs_type); if (!err) insert_binfmt(_format); + register_sysctl_empty_subdir("fs", "binfmt_misc"); return err; } build error when CONFIG_BINFMT_MISC=m ERROR: modpost: "register_sysctl_empty_subdir" [fs/binfmt_misc.ko] undefined! diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 27f0c9ea..4129dfb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2853,6 +2853,7 @@ void register_sysctl_empty_subdir(const char *base, { register_sysctl_subdir(base, subdir, sysctl_mount_point); } +EXPORT_SYMBOL_GPL(register_sysctl_empty_subdir); #endif /* CONFIG_SYSCTL */ Thanks Xiaoming Ni
Re: [PATCH 11/13] random: simplify sysctl declaration with register_sysctl_subdir()
On 2020/5/29 18:26, Greg KH wrote: On Fri, May 29, 2020 at 07:41:06AM +, Luis Chamberlain wrote: From: Xiaoming Ni Move random_table sysctl from kernel/sysctl.c to drivers/char/random.c and use register_sysctl_subdir() to help remove the clutter out of kernel/sysctl.c. Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain --- drivers/char/random.c | 14 -- include/linux/sysctl.h | 1 - kernel/sysctl.c| 5 - 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index a7cf6aa65908..73fd4b6e9c18 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2101,8 +2101,7 @@ static int proc_do_entropy(struct ctl_table *table, int write, } static int sysctl_poolsize = INPUT_POOL_WORDS * 32; -extern struct ctl_table random_table[]; -struct ctl_table random_table[] = { +static struct ctl_table random_table[] = { { .procname = "poolsize", .data = _poolsize, @@ -2164,6 +2163,17 @@ struct ctl_table random_table[] = { #endif { } }; + +/* + * rand_initialize() is called before sysctl_init(), + * so we cannot call register_sysctl_init() in rand_initialize() + */ +static int __init random_sysctls_init(void) +{ + register_sysctl_subdir("kernel", "random", random_table); No error checking? :( It was my mistake, I forgot to add a comment here. Same as the comment of register_sysctl_init(), There is almost no failure here during the system initialization phase, and failure in time does not affect the main function. Thanks Xiaoming Ni
Re: [PATCH 09/13] firmware_loader: simplify sysctl declaration with register_sysctl_subdir()
On 2020/5/29 18:26, Greg KH wrote: On Fri, May 29, 2020 at 07:41:04AM +, Luis Chamberlain wrote: From: Xiaoming Ni Move the firmware config sysctl table to fallback_table.c and use the new register_sysctl_subdir() helper. This removes the clutter from kernel/sysctl.c. Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain --- drivers/base/firmware_loader/fallback.c | 4 drivers/base/firmware_loader/fallback.h | 11 ++ drivers/base/firmware_loader/fallback_table.c | 22 +-- include/linux/sysctl.h| 1 - kernel/sysctl.c | 7 -- 5 files changed, 35 insertions(+), 10 deletions(-) So it now takes more lines than the old stuff? :( CONFIG_FW_LOADER = m Before cleaning, no matter whether ko is loaded or not, the sysctl interface will be created, but now we need to add register and unregister interfaces, so the number of lines of code has increased diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index d9ac7296205e..8190653ae9a3 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -200,12 +200,16 @@ static struct class firmware_class = { int register_sysfs_loader(void) { + int ret = register_firmware_config_sysctl(); + if (ret != 0) + return ret; checkpatch :( This is my fault, thanks for your guidance return class_register(_class); And if that fails? Yes, it is better to call register_firmware_config_sysctl() after class_register(). thanks for your guidance. } void unregister_sysfs_loader(void) { class_unregister(_class); + unregister_firmware_config_sysctl(); } static ssize_t firmware_loading_show(struct device *dev, diff --git a/drivers/base/firmware_loader/fallback.h b/drivers/base/firmware_loader/fallback.h index 06f4577733a8..7d2cb5f6ceb8 100644 --- a/drivers/base/firmware_loader/fallback.h +++ b/drivers/base/firmware_loader/fallback.h @@ -42,6 +42,17 @@ void fw_fallback_set_default_timeout(void); int register_sysfs_loader(void); void unregister_sysfs_loader(void); +#ifdef CONFIG_SYSCTL +extern int register_firmware_config_sysctl(void); +extern void unregister_firmware_config_sysctl(void); +#else +static inline int register_firmware_config_sysctl(void) +{ + return 0; +} +static inline void unregister_firmware_config_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ + #else /* CONFIG_FW_LOADER_USER_HELPER */ static inline int firmware_fallback_sysfs(struct firmware *fw, const char *name, struct device *device, diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 46a731dede6f..4234aa5ee5df 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -24,7 +24,7 @@ struct firmware_fallback_config fw_fallback_config = { EXPORT_SYMBOL_NS_GPL(fw_fallback_config, FIRMWARE_LOADER_PRIVATE); #ifdef CONFIG_SYSCTL -struct ctl_table firmware_config_table[] = { +static struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", .data = _fallback_config.force_sysfs_fallback, @@ -45,4 +45,22 @@ struct ctl_table firmware_config_table[] = { }, { } }; -#endif + +static struct ctl_table_header *hdr; +int register_firmware_config_sysctl(void) +{ + if (hdr) + return -EEXIST; How can hdr be set? It's my mistake, register_firmware_config_sysctl() is not exported, there will be no repeated calls. thanks for your guidance. + hdr = register_sysctl_subdir("kernel", "firmware_config", +firmware_config_table); + if (!hdr) + return -ENOMEM; + return 0; +} + +void unregister_firmware_config_sysctl(void) +{ + if (hdr) + unregister_sysctl_table(hdr); Why can't unregister_sysctl_table() take a null pointer value? Sorry, I didn't notice that the unregister_sysctl_table() already checks the input parameters. thanks for your guidance. And what sets 'hdr' (worst name for a static variable) to NULL so that it knows not to be unregistered again as it looks like register_firmware_config_sysctl() could be called multiple times. How about renaming hdr to firmware_config_sysct_table_header? + if (hdr) + return -EEXIST; After deleting this code in register_firmware_config_sysctl(), and considering register_firmware_config_sysctl() and unregister_firmware_config_sysctl() are not exported, whether there is no need to add "hdr = NULL;" ? Thanks Xiaoming Ni
Re: [PATCH v4 1/4] sysctl: Add register_sysctl_init() interface
On 2020/5/29 15:36, Luis Chamberlain wrote: On Fri, May 29, 2020 at 03:27:22PM +0800, Xiaoming Ni wrote: On 2020/5/29 15:09, Luis Chamberlain wrote: On Tue, May 19, 2020 at 11:31:08AM +0800, Xiaoming Ni wrote: --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3358,6 +3358,25 @@ int __init sysctl_init(void) kmemleak_not_leak(hdr); return 0; } + +/* + * The sysctl interface is used to modify the interface value, + * but the feature interface has default values. Even if register_sysctl fails, + * the feature body function can also run. At the same time, malloc small + * fragment of memory during the system initialization phase, almost does + * not fail. Therefore, the function return is designed as void + */ Let's use kdoc while at it. Can you convert this to proper kdoc? Sorry, I do n’t know the format requirements of Kdoc, can you give me some tips for writing? Sure, include/net/mac80211.h is a good example. +void __init register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; table_name is only used for this, however we can easily just make another _register_sysctl_init() helper first, and then use a macro which will concatenate this to something useful if you want to print a string. I see no point in the description for this, specially since the way it was used was not to be descriptive, but instead just a name followed by some underscore and something else. Good idea, I will fix and send the patch to you as soon as possible No rush :) + } + kmemleak_not_leak(hdr); Is it *wrong* to run kmemleak_not_leak() when hdr was not allocated? If so, can you fix the sysctl __init call itself? I don't understand here, do you mean that register_sysctl_init () does not need to call kmemleak_not_leak (hdr), or does it mean to add check hdr before calling kmemleak_not_leak (hdr) in sysctl_init ()? I'm asking that the way you are adding it, you don't run kmemleak_not_leak(hdr) if the hdr allocation filed. If that is right then it seems that sysctl_init() might not be doing it right. Can that code be shared somehow? Luis void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); } EXPORT_SYMBOL(kmemleak_not_leak); In the code of kmemleak_not_leak(), it is verified that the pointer is valid, so kmemleak_not_leak (NULL) will not be a problem. At the same time, there is no need to call kmemleak_not_leak() in the failed branch of register_sysctl_init(). Thanks Xiaoming Ni
Re: [PATCH v4 1/4] sysctl: Add register_sysctl_init() interface
On 2020/5/29 15:09, Luis Chamberlain wrote: On Tue, May 19, 2020 at 11:31:08AM +0800, Xiaoming Ni wrote: --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3358,6 +3358,25 @@ int __init sysctl_init(void) kmemleak_not_leak(hdr); return 0; } + +/* + * The sysctl interface is used to modify the interface value, + * but the feature interface has default values. Even if register_sysctl fails, + * the feature body function can also run. At the same time, malloc small + * fragment of memory during the system initialization phase, almost does + * not fail. Therefore, the function return is designed as void + */ Let's use kdoc while at it. Can you convert this to proper kdoc? Sorry, I do n’t know the format requirements of Kdoc, can you give me some tips for writing? +void __init register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; table_name is only used for this, however we can easily just make another _register_sysctl_init() helper first, and then use a macro which will concatenate this to something useful if you want to print a string. I see no point in the description for this, specially since the way it was used was not to be descriptive, but instead just a name followed by some underscore and something else. Good idea, I will fix and send the patch to you as soon as possible + } + kmemleak_not_leak(hdr); Is it *wrong* to run kmemleak_not_leak() when hdr was not allocated? If so, can you fix the sysctl __init call itself? I don't understand here, do you mean that register_sysctl_init () does not need to call kmemleak_not_leak (hdr), or does it mean to add check hdr before calling kmemleak_not_leak (hdr) in sysctl_init ()? PS. Since you have given me your series, feel free to send me a patch as a follow up to this in privae and I can integrate it into my tree. Luis Thanks Xiaoming Ni
Re: [PATCH v2 1/1] userfaultfd/sysctl: add vm.unprivileged_userfaultfd
On 2020/5/27 22:21, Peter Xu wrote: On Wed, May 27, 2020 at 02:54:13PM +0800, Xiaoming Ni wrote: On Tue, Mar 19, 2019 at 11:07:22AM +0800, Peter Xu wrote: Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users. When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Hello Hi, Xiaoming, I am a bit confused about this patch, can you help to answer it. Why the sysctl interface of fs/userfaultfd.c belongs to vm_table instead of fs_table ? https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cefdca0a86be517bc390fc4541e3674b8e7803b0 Because I think it makes more sense to put the new key into where it suites better, irrelevant to which directory the variable is declared. To me, unprivileged_userfaultfd is definitely more suitable for vm rather than fs, because userfaultfd is really about memory management rather than file system. Thanks, Thank you for your answer Since userfaultfd and vm are more closely related, will there be consideration to move fs/userfaultfd.c to the mm directory in the future? Thanks Xiaoming Ni
Re: [PATCH v2 1/1] userfaultfd/sysctl: add vm.unprivileged_userfaultfd
On Tue, Mar 19, 2019 at 11:07:22AM +0800, Peter Xu wrote: > Add a global sysctl knob "vm.unprivileged_userfaultfd" to control > whether userfaultfd is allowed by unprivileged users. When this is > set to zero, only privileged users (root user, or users with the > CAP_SYS_PTRACE capability) will be able to use the userfaultfd > syscalls. Hello I am a bit confused about this patch, can you help to answer it. Why the sysctl interface of fs/userfaultfd.c belongs to vm_table instead of fs_table ? https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cefdca0a86be517bc390fc4541e3674b8e7803b0 Thanks Xiaoming Ni
Re: [PATCH v4 0/4] cleaning up the sysctls table (hung_task watchdog)
On 2020/5/20 11:31, Andrew Morton wrote: On Tue, 19 May 2020 11:31:07 +0800 Xiaoming Ni wrote: Kernel/sysctl.c eek! fs/proc/proc_sysctl.c| 2 +- include/linux/sched/sysctl.h | 14 +-- include/linux/sysctl.h | 13 ++- kernel/hung_task.c | 77 +++- kernel/sysctl.c | 214 +++ kernel/watchdog.c| 101 6 files changed, 224 insertions(+), 197 deletions(-) Here's what we presently have happening in linux-next's kernel/sysctl.c: sysctl.c | 3109 ++- 1 file changed, 1521 insertions(+), 1588 deletions(-) So this is not a good time for your patch! Can I suggest that you set the idea aside and take a look after 5.8-rc1 is released? ok, I will make v5 patch based on 5.8-rc1 after 5.8-rc1 is released, And add more sysctl table cleanup. Thanks Xiaoming Ni
Re: [PATCH v4 2/4] sysctl: Move some boundary constants form sysctl.c to sysctl_vals
On 2020/5/19 12:44, Tetsuo Handa wrote: On 2020/05/19 12:31, Xiaoming Ni wrote: Some boundary (.extra1 .extra2) constants (E.g: neg_one two) in sysctl.c are used in multiple features. Move these variables to sysctl_vals to avoid adding duplicate variables when cleaning up sysctls table. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook I feel that it is use of void *extra1; void *extra2; in "struct ctl_table" that requires constant values indirection. Can't we get rid of sysctl_vals using some "union" like below? struct ctl_table { const char *procname; /* Text ID for /proc/sys, or zero */ void *data; int maxlen; umode_t mode; struct ctl_table *child;/* Deprecated */ proc_handler *proc_handler; /* Callback for text formatting */ struct ctl_table_poll *poll; union { void *min_max_ptr[2]; int min_max_int[2]; long min_max_long[2]; }; } __randomize_layout; . net/decnet/dn_dev.c: static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) { struct dn_dev_sysctl_table *t; int i; char path[sizeof("net/decnet/conf/") + IFNAMSIZ]; t = kmemdup(_dev_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) return; for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) { long offset = (long)t->dn_dev_vars[i].data; t->dn_dev_vars[i].data = ((char *)parms) + offset; } snprintf(path, sizeof(path), "net/decnet/conf/%s", dev? dev->name : parms->name); t->dn_dev_vars[0].extra1 = (void *)dev; t->sysctl_header = register_net_sysctl(_net, path, t->dn_dev_vars); if (t->sysctl_header == NULL) kfree(t); else parms->sysctl = t; } A small amount of code is not used as a boundary value when using extra1. This scenario may not be suitable for renaming to min_max_ptr. Should we add const to extra1 extra2 ? --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -124,8 +124,8 @@ struct ctl_table { struct ctl_table *child;/* Deprecated */ proc_handler *proc_handler; /* Callback for text formatting */ struct ctl_table_poll *poll; - void *extra1; - void *extra2; + const void *extra1; + const void *extra2; } __randomize_layout; Thanks Xiaoming Ni
[PATCH v4 0/4] cleaning up the sysctls table (hung_task watchdog)
Kernel/sysctl.c contains more than 190 interface files, and there are a large number of config macro controls. When modifying the sysctl interface directly in kernel/sysctl.c, conflicts are very easy to occur. E.g: https://lore.kernel.org/lkml/99095805-8cbe-d140-e2f1-0c5a3e84d...@huawei.com/ Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. So consider cleaning up the sysctls table, details are in: https://kernelnewbies.org/KernelProjects/proc https://lore.kernel.org/lkml/20200513141421.gp11...@42.do-not-panic.com/#t The current patch set extracts register_sysctl_init and some sysctl_vals variables, and clears the interface of hung_task and watchdog in sysctl.c. The current patch set is based on linux-next, commit 72bc15d0018ebfbc9 ("Add linux-next specific files for 20200518"). changes in v4: Handle the conflict with the commit d4ee116819ed714f ("kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected"), move the sysctl interface hung_task_all_cpu_backtrace to hung_task.c. V3: https://lore.kernel.org/lkml/1589774397-42485-1-git-send-email-nixiaom...@huawei.com/ base on commit b9bbe6ed63b2b9 ("Linux 5.7-rc6") changes in v3: 1. make hung_task_timeout_max to be const 2. fix build warning: kernel/watchdog.c:779:14: warning: initialization discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers] .extra2 = , ^ V2: https://lore.kernel.org/lkml/1589619315-65827-1-git-send-email-nixiaom...@huawei.com/ changes in v2: 1. Adjusted the order of patches, first do public function extraction, then do feature code movement 2. Move hung_task sysctl to hung_task.c instead of adding new file 3. Extract multiple common variables instead of only neg_one, and keep the order of member values in sysctl_vals 4. Add const modification to the variable sixty in watchdog sysctl V1: https://lore.kernel.org/lkml/1589517224-123928-1-git-send-email-nixiaom...@huawei.com/ Xiaoming Ni (4): sysctl: Add register_sysctl_init() interface sysctl: Move some boundary constants form sysctl.c to sysctl_vals hung_task: Move hung_task sysctl interface to hung_task.c watchdog: move watchdog sysctl interface to watchdog.c fs/proc/proc_sysctl.c| 2 +- include/linux/sched/sysctl.h | 14 +-- include/linux/sysctl.h | 13 ++- kernel/hung_task.c | 77 +++- kernel/sysctl.c | 214 +++ kernel/watchdog.c| 101 6 files changed, 224 insertions(+), 197 deletions(-) -- 1.8.5.6
[PATCH v4 1/4] sysctl: Add register_sysctl_init() interface
In order to eliminate the duplicate code for registering the sysctl interface during the initialization of each feature, add the register_sysctl_init() interface Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c| 19 +++ 2 files changed, 21 insertions(+) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 50bb7f3..857ba93 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -197,6 +197,8 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name); void do_sysctl_args(void); extern int pwrsw_enabled; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc1fcba..8afd713 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3358,6 +3358,25 @@ int __init sysctl_init(void) kmemleak_not_leak(hdr); return 0; } + +/* + * The sysctl interface is used to modify the interface value, + * but the feature interface has default values. Even if register_sysctl fails, + * the feature body function can also run. At the same time, malloc small + * fragment of memory during the system initialization phase, almost does + * not fail. Therefore, the function return is designed as void + */ +void __init register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} #endif /* CONFIG_SYSCTL */ /* * No sense putting this after each symbol definition, twice, -- 1.8.5.6
[PATCH v4 2/4] sysctl: Move some boundary constants form sysctl.c to sysctl_vals
Some boundary (.extra1 .extra2) constants (E.g: neg_one two) in sysctl.c are used in multiple features. Move these variables to sysctl_vals to avoid adding duplicate variables when cleaning up sysctls table. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 11 --- kernel/sysctl.c| 39 +-- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5b405f3..3d65e7d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -24,7 +24,7 @@ static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 1000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 857ba93..97586ee 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,14 @@ struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO((void *)_vals[0]) -#define SYSCTL_ONE ((void *)_vals[1]) -#define SYSCTL_INT_MAX ((void *)_vals[2]) +#define SYSCTL_NEG_ONE ((void *)_vals[0]) +#define SYSCTL_ZERO((void *)_vals[1]) +#define SYSCTL_ONE ((void *)_vals[2]) +#define SYSCTL_TWO ((void *)_vals[3]) +#define SYSCTL_FOUR((void *)_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)_vals[5]) +#define SYSCTL_ONE_THOUSAND((void *)_vals[6]) +#define SYSCTL_INT_MAX ((void *)_vals[7]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8afd713..38469bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -111,14 +111,9 @@ static int sixty = 60; #endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 1; #endif @@ -1885,7 +1880,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = _one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -2227,7 +,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, #endif { @@ -2487,7 +2482,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = _one, + .extra1 = SYSCTL_NEG_ONE, }, #endif #ifdef CONFIG_RT_MUTEXES @@ -2549,7 +2544,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -2567,7 +2562,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -2642,7 +2637,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -2651,7 +2646,7 @@ int proc_do_static_key(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -2696,7 +2691,7 @@ int proc_do_static_key(struct ctl_table *table, int write,
[PATCH v4 4/4] watchdog: move watchdog sysctl interface to watchdog.c
Move watchdog syscl interface to watchdog.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- kernel/sysctl.c | 96 --- kernel/watchdog.c | 101 ++ 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b7fd4e6..88235fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,16 +100,10 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -2231,96 +2225,6 @@ int proc_do_static_key(struct ctl_table *table, int write, .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = _user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = _thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = , - }, - { - .procname = "nmi_watchdog", - .data = _watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = _cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = _watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = _panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = _softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = _panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = _hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1b93953..b000326 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -758,6 +758,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock
[PATCH v4 3/4] hung_task: Move hung_task sysctl interface to hung_task.c
Move hung_task sysctl interface to hung_task.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- include/linux/sched/sysctl.h | 14 +--- kernel/hung_task.c | 77 +++- kernel/sysctl.c | 62 --- 3 files changed, 77 insertions(+), 76 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 660ac49..fcd397a8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,20 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK - -#ifdef CONFIG_SMP -extern unsigned int sysctl_hung_task_all_cpu_backtrace; -#else -#define sysctl_hung_task_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index a672db8..d67df599 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,6 +63,9 @@ * Defaults to 0, can be changed via sysctl. */ unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; #endif /* CONFIG_SMP */ /* @@ -265,10 +268,11 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -285,6 +289,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = _hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = _hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = _hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = _hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = _hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = _hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("ker
Re: [PATCH v3 0/4] cleaning up the sysctls table (hung_task watchdog)
On 2020/5/19 1:16, Luis Chamberlain wrote: On Mon, May 18, 2020 at 11:59:53AM +0800, Xiaoming Ni wrote: Kernel/sysctl.c contains more than 190 interface files, and there are a large number of config macro controls. When modifying the sysctl interface directly in kernel/sysctl.c, conflicts are very easy to occur. E.g: https://lkml.org/lkml/2020/5/10/413. FWIW un the future please avoid using lkmk.org and instead use https://lkml.kernel.org/r/ for references. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. So consider cleaning up the sysctls table, details are in: https://kernelnewbies.org/KernelProjects/proc https://lkml.org/lkml/2020/5/13/990 The current patch set extracts register_sysctl_init and some sysctl_vals variables, and clears the interface of hung_task and watchdog in sysctl.c. The current patch set is based on commit b9bbe6ed63b2b9 ("Linux 5.7-rc6"), which conflicts with the latest branch of linux-next: 9b4caf6941fc41d ("kernel / hung_task.c: introduce sysctl to print all traces when a hung task is detected") Should I modify to make patch based on the "linux-next" branch to avoid conflicts, or other branches? If you can do that, that would be appreciated. I have a sysctl fs cleanup stuff, so I can take your patches, and put my work ont op of yours and then send this to Andrew once done. Luis Ok, I will redo the v4 version based on the linux-next branch as soon as possible I want to continue to participate in the subsequent sysctl cleanup, how to push the subsequent cleanup patch to your series, and minimize conflict Thanks Xiaoming Ni
[PATCH v3 4/4] watchdog: move watchdog sysctl interface to watchdog.c
Move watchdog syscl interface to watchdog.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- kernel/sysctl.c | 96 --- kernel/watchdog.c | 101 ++ 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 90e3ea8..cd85a1c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,9 +97,6 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif #if defined(CONFIG_SYSCTL) @@ -120,9 +117,6 @@ #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -883,96 +877,6 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = _user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = _thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = , - }, - { - .procname = "nmi_watchdog", - .data = _watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = _cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = _watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = _panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = _softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = _panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = _hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54..8c833b6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -756,6 +756,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock
[PATCH v3 0/4] cleaning up the sysctls table (hung_task watchdog)
Kernel/sysctl.c contains more than 190 interface files, and there are a large number of config macro controls. When modifying the sysctl interface directly in kernel/sysctl.c, conflicts are very easy to occur. E.g: https://lkml.org/lkml/2020/5/10/413. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. So consider cleaning up the sysctls table, details are in: https://kernelnewbies.org/KernelProjects/proc https://lkml.org/lkml/2020/5/13/990 The current patch set extracts register_sysctl_init and some sysctl_vals variables, and clears the interface of hung_task and watchdog in sysctl.c. The current patch set is based on commit b9bbe6ed63b2b9 ("Linux 5.7-rc6"), which conflicts with the latest branch of linux-next: 9b4caf6941fc41d ("kernel / hung_task.c: introduce sysctl to print all traces when a hung task is detected") Should I modify to make patch based on the "linux-next" branch to avoid conflicts, or other branches? changes in v3: 1. make hung_task_timeout_max to be const 2. fix build warning: kernel/watchdog.c:779:14: warning: initialization discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers] .extra2 = , ^ V2: https://lkml.org/lkml/2020/5/16/81 changes in v2: 1. Adjusted the order of patches, first do public function extraction, then do feature code movement 2. Move hung_task sysctl to hung_task.c instead of adding new file 3. Extract multiple common variables instead of only neg_one, and keep the order of member values in sysctl_vals 4. Add const modification to the variable sixty in watchdog sysctl V1: https://lkml.org/lkml/2020/5/15/17 Xiaoming Ni (4): sysctl: Add register_sysctl_init() interface sysctl: Move some boundary constants form sysctl.c to sysctl_vals hung_task: Move hung_task sysctl interface to hung_task.c watchdog: move watchdog sysctl interface to watchdog.c fs/proc/proc_sysctl.c| 2 +- include/linux/sched/sysctl.h | 8 +- include/linux/sysctl.h | 13 ++- kernel/hung_task.c | 63 +- kernel/sysctl.c | 202 --- kernel/watchdog.c| 101 ++ 6 files changed, 210 insertions(+), 179 deletions(-) -- 1.8.5.6
[PATCH v3 3/4] hung_task: Move hung_task sysctl interface to hung_task.c
Move hung_task sysctl interface to hung_task.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- include/linux/sched/sysctl.h | 8 +- kernel/hung_task.c | 63 +++- kernel/sysctl.c | 50 --- 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215..c075e09 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,14 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, -void __user *buffer, -size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c..d010c96 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -215,10 +215,11 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -235,6 +236,65 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { + { + .procname = "hung_task_panic", + .data = _hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = _hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = _hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = _hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = _hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls, "hung_task_sysctls"); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -304,6 +364,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 76c3237..90e3ea8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -144,13 +144,6 @@ static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_tas
[PATCH v3 1/4] sysctl: Add register_sysctl_init() interface
In order to eliminate the duplicate code for registering the sysctl interface during the initialization of each feature, add the register_sysctl_init() interface Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c| 19 +++ 2 files changed, 21 insertions(+) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02fa844..43f8ef9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -206,6 +206,8 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name); extern struct ctl_table sysctl_mount_point[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8..c96122f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1968,6 +1968,25 @@ int __init sysctl_init(void) return 0; } +/* + * The sysctl interface is used to modify the interface value, + * but the feature interface has default values. Even if register_sysctl fails, + * the feature body function can also run. At the same time, malloc small + * fragment of memory during the system initialization phase, almost does + * not fail. Therefore, the function return is designed as void + */ +void __init register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + #endif /* CONFIG_SYSCTL */ /* -- 1.8.5.6
[PATCH v3 2/4] sysctl: Move some boundary constants form sysctl.c to sysctl_vals
Some boundary (.extra1 .extra2) constants (E.g: neg_one two) in sysctl.c are used in multiple features. Move these variables to sysctl_vals to avoid adding duplicate variables when cleaning up sysctls table. Signed-off-by: Xiaoming Ni Reviewed-by: Kees Cook --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 11 --- kernel/sysctl.c| 37 - 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d45..3f77e64 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -23,7 +23,7 @@ static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 1000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 43f8ef9..bf97c30 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,14 @@ struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO((void *)_vals[0]) -#define SYSCTL_ONE ((void *)_vals[1]) -#define SYSCTL_INT_MAX ((void *)_vals[2]) +#define SYSCTL_NEG_ONE ((void *)_vals[0]) +#define SYSCTL_ZERO((void *)_vals[1]) +#define SYSCTL_ONE ((void *)_vals[2]) +#define SYSCTL_TWO ((void *)_vals[3]) +#define SYSCTL_FOUR((void *)_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)_vals[5]) +#define SYSCTL_ONE_THOUSAND((void *)_vals[6]) +#define SYSCTL_INT_MAX ((void *)_vals[7]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c96122f..76c3237 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,14 +124,9 @@ static int sixty = 60; #endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 1; #endif @@ -547,7 +542,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = _one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -878,7 +873,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, #endif { @@ -1187,7 +1182,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -1205,7 +1200,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -1280,7 +1275,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -1289,7 +1284,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -1334,7 +1329,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_b
Re: [PATCH v2 3/4] hung_task: Move hung_task sysctl interface to hung_task.c
On 2020/5/17 10:43, Kees Cook wrote: On Sat, May 16, 2020 at 04:55:14PM +0800, Xiaoming Ni wrote: +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static unsigned long hung_task_timeout_max = (LONG_MAX / HZ); Please make this const. With that done, yes, looks great! Reviewed-by: Kees Cook Thank you for your guidance, I will fix it in v3 In addition, I am a bit confused about the patch submission, and hope to get everyone's answer. I made this patch based on the master branch. But as in conflict at https://lkml.org/lkml/2020/5/10/413, my patch will inevitably conflict. Should I modify to make patch based on "linux-next" branch to avoid conflict, or other branches? Thanks Xiaoming Ni
[PATCH v2 1/4] sysctl: Add register_sysctl_init() interface
In order to eliminate the duplicate code for registering the sysctl interface during the initialization of each feature, add the register_sysctl_init() interface Signed-off-by: Xiaoming Ni --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c| 19 +++ 2 files changed, 21 insertions(+) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02fa844..43f8ef9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -206,6 +206,8 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name); extern struct ctl_table sysctl_mount_point[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b9ff323..20c31f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1970,6 +1970,25 @@ int __init sysctl_init(void) return 0; } +/* + * The sysctl interface is used to modify the interface value, + * but the feature interface has default values. Even if register_sysctl fails, + * the feature body function can also run. At the same time, malloc small + * fragment of memory during the system initialization phase, almost does + * not fail. Therefore, the function return is designed as void + */ +void __init register_sysctl_init(const char *path, struct ctl_table *table, +const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + #endif /* CONFIG_SYSCTL */ /* -- 1.8.5.6
[PATCH v2 0/4] cleaning up the sysctls table (hung_task watchdog)
Kernel/sysctl.c contains more than 190 interface files, and there are a large number of config macro controls. When modifying the sysctl interface directly in kernel/sysctl.c, conflicts are very easy to occur. E.g: https://lkml.org/lkml/2020/5/10/413. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. So consider cleaning up the sysctls table, details are in: https://kernelnewbies.org/KernelProjects/proc https://lkml.org/lkml/2020/5/13/990 The current patch set extracts register_sysctl_init and some sysctl_vals variables, and clears the interface of hung_task and watchdog in sysctl.c. changes in v2: 1. Adjusted the order of patches, first do public function extraction, then do feature code movement 2. Move hung_task sysctl to hung_task.c instead of adding new file 3. Extract multiple common variables instead of only neg_one, and keep the order of member values in sysctl_vals 4. Add const modification to the variable sixty in watchdog sysctl V1: https://lkml.org/lkml/2020/5/15/17 Xiaoming Ni (4): sysctl: Add register_sysctl_init() interface sysctl: Move some boundary constants form sysctl.c to sysctl_vals hung_task: Move hung_task sysctl interface to hung_task.c watchdog: move watchdog sysctl interface to watchdog.c fs/proc/proc_sysctl.c| 2 +- include/linux/sched/sysctl.h | 8 +- include/linux/sysctl.h | 13 ++- kernel/hung_task.c | 63 +- kernel/sysctl.c | 202 --- kernel/watchdog.c| 101 ++ 6 files changed, 210 insertions(+), 179 deletions(-) -- 1.8.5.6
[PATCH v2 3/4] hung_task: Move hung_task sysctl interface to hung_task.c
Move hung_task sysctl interface to hung_task.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. Signed-off-by: Xiaoming Ni --- include/linux/sched/sysctl.h | 8 +- kernel/hung_task.c | 63 +++- kernel/sysctl.c | 50 --- 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215..c075e09 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,14 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, -void __user *buffer, -size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c..76ea496 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -215,10 +215,11 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -235,6 +236,65 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { + { + .procname = "hung_task_panic", + .data = _hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = _hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = _hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = _task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = _hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = _task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = _hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls, "hung_task_sysctls"); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -304,6 +364,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5f08cc2..49e9541 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -144,13 +144,6 @@ static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_task_check_interval_secs - */ -#ifdef CONFIG_DETEC
[PATCH v2 4/4] watchdog: move watchdog sysctl interface to watchdog.c
Move watchdog syscl interface to watchdog.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. Signed-off-by: Xiaoming Ni --- kernel/sysctl.c | 96 --- kernel/watchdog.c | 101 ++ 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 49e9541..efe6172 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,9 +97,6 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif #if defined(CONFIG_SYSCTL) @@ -120,9 +117,6 @@ #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -883,96 +877,6 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = _user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = _thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = , - }, - { - .procname = "nmi_watchdog", - .data = _watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = _cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = _watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = _panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = _softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = _panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = _hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54..f5d19be 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -756,6 +756,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock
[PATCH v2 2/4] sysctl: Move some boundary constants form sysctl.c to sysctl_vals
Some boundary (.extra1 .extra2) constants (E.g: neg_one two) in sysctl.c are used in multiple features. Move these variables to sysctl_vals to avoid adding duplicate variables when cleaning up sysctls table. Signed-off-by: Xiaoming Ni --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 11 --- kernel/sysctl.c| 37 - 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d45..3f77e64 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -23,7 +23,7 @@ static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 1000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 43f8ef9..bf97c30 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,14 @@ struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO((void *)_vals[0]) -#define SYSCTL_ONE ((void *)_vals[1]) -#define SYSCTL_INT_MAX ((void *)_vals[2]) +#define SYSCTL_NEG_ONE ((void *)_vals[0]) +#define SYSCTL_ZERO((void *)_vals[1]) +#define SYSCTL_ONE ((void *)_vals[2]) +#define SYSCTL_TWO ((void *)_vals[3]) +#define SYSCTL_FOUR((void *)_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)_vals[5]) +#define SYSCTL_ONE_THOUSAND((void *)_vals[6]) +#define SYSCTL_INT_MAX ((void *)_vals[7]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 20c31f5..5f08cc2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,14 +124,9 @@ static int sixty = 60; #endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 1; #endif @@ -547,7 +542,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = _one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -878,7 +873,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, #endif { @@ -1189,7 +1184,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -1207,7 +1202,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -1282,7 +1277,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -1291,7 +1286,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -1336,7 +1331,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes
Re: [PATCH 2/4] proc/sysctl: add shared variables -1
On 2020/5/16 10:47, Kees Cook wrote: On Sat, May 16, 2020 at 10:32:19AM +0800, Xiaoming Ni wrote: On 2020/5/16 0:05, Kees Cook wrote: On Fri, May 15, 2020 at 05:06:28PM +0800, Xiaoming Ni wrote: On 2020/5/15 16:06, Kees Cook wrote: On Fri, May 15, 2020 at 12:33:42PM +0800, Xiaoming Ni wrote: Add the shared variable SYSCTL_NEG_ONE to replace the variable neg_one used in both sysctl_writes_strict and hung_task_warnings. Signed-off-by: Xiaoming Ni --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h| 1 + kernel/hung_task_sysctl.c | 3 +-- kernel/sysctl.c | 3 +-- How about doing this refactoring in advance of the extraction patch? Before advance of the extraction patch, neg_one is only used in one file, does it seem to have no value for refactoring? I guess it doesn't matter much, but I think it's easier to review in the sense that neg_one is first extracted and then later everything else is moved. Later, when more features sysctl interface is moved to the code file, there will be more variables that need to be extracted. So should I only extract the neg_one variable here, or should I extract all the variables used by multiple features? Hmm -- if you're going to do a consolidation pass, then nevermind, I don't think order will matter then. Thank you for the cleanup! Sorry we're giving you back-and-forth advice! -Kees Sorry, I don't fully understand. Does this mean that there is no need to adjust the patch order or the order of variables in sysctl_vals? Should I extract only SYSCTL_NEG_ONE or should I extract all variables? Thanks Xiaoming Ni
Re: [PATCH 2/4] proc/sysctl: add shared variables -1
On 2020/5/16 0:05, Kees Cook wrote: On Fri, May 15, 2020 at 05:06:28PM +0800, Xiaoming Ni wrote: On 2020/5/15 16:06, Kees Cook wrote: On Fri, May 15, 2020 at 12:33:42PM +0800, Xiaoming Ni wrote: Add the shared variable SYSCTL_NEG_ONE to replace the variable neg_one used in both sysctl_writes_strict and hung_task_warnings. Signed-off-by: Xiaoming Ni --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h| 1 + kernel/hung_task_sysctl.c | 3 +-- kernel/sysctl.c | 3 +-- How about doing this refactoring in advance of the extraction patch? Before advance of the extraction patch, neg_one is only used in one file, does it seem to have no value for refactoring? I guess it doesn't matter much, but I think it's easier to review in the sense that neg_one is first extracted and then later everything else is moved. Later, when more features sysctl interface is moved to the code file, there will be more variables that need to be extracted. So should I only extract the neg_one variable here, or should I extract all the variables used by multiple features? fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 11 --- kernel/sysctl.c| 37 - 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d45..3f77e64 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -23,7 +23,7 @@ static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 1000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 43f8ef9..bf97c30 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,14 @@ struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO((void *)_vals[0]) -#define SYSCTL_ONE ((void *)_vals[1]) -#define SYSCTL_INT_MAX ((void *)_vals[2]) +#define SYSCTL_NEG_ONE ((void *)_vals[0]) +#define SYSCTL_ZERO((void *)_vals[1]) +#define SYSCTL_ONE ((void *)_vals[2]) +#define SYSCTL_TWO ((void *)_vals[3]) +#define SYSCTL_FOUR((void *)_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)_vals[5]) +#define SYSCTL_ONE_THOUSAND((void *)_vals[6]) +#define SYSCTL_INT_MAX ((void *)_vals[7]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5dd6d01..efe6172 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -118,14 +118,9 @@ /* Constants used for minimum and maximum */ -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 1; #endif @@ -534,7 +529,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = _one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -865,7 +860,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, #endif { @@ -1043,7 +1038,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -1061,7 +1056,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = _thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -1136,7 +1131,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = , + .extra2 = SYSCTL_TWO, }, { .procname = &qu