Re: [RFC PATCH 2/7] osdump: reuse some code from crash_core to get vmcoreinfo

2024-01-04 Thread Petr Mladek
Hi Qi,

first, most people, including me, prefer to be in Cc for the entire patchset.
It helps to get the whole picture.

This mail is even worse because the other patches are not in the same
thread. As a result, I can't find the other patches even via lore,
see https://lore.kernel.org/all/20231221132522.547-1-ruipeng...@gmail.com/


On Thu 2023-12-21 21:25:22, Ruipeng Qi wrote:
> From: qiruipeng 
> 
> Osdump is a new crash dumping solution like crash. It is interested in
> vmcoreinfo,too. Reuse some data and function from crash_core, but not full
> of them. So pick some code to get vmcoreinfo as needed.

> diff --git a/kernel/crash_core_mini.c b/kernel/crash_core_mini.c
> new file mode 100644
> index ..a0f8d0c79bba
> --- /dev/null
> +++ b/kernel/crash_core_mini.c
> @@ -0,0 +1,275 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * crash.c - kernel crash support code.
> + * Copyright (C) 2002-2004 Eric Biederman  
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +
> +#include 
> +
> +#include "kallsyms_internal.h"
> +#include "kexec_internal.h"
> +
> +/* Per cpu memory for storing cpu states in case of system crash. */
> +note_buf_t __percpu *crash_notes;
> +
> +/* vmcoreinfo stuff */
> +unsigned char *vmcoreinfo_data;
> +size_t vmcoreinfo_size;
> +u32 *vmcoreinfo_note;
> +
> +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
> +static unsigned char *vmcoreinfo_data_safecopy;
> +
> +
> +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
> +   void *data, size_t data_len)
> +{
> + struct elf_note *note = (struct elf_note *)buf;
> +
> + note->n_namesz = strlen(name) + 1;
> + note->n_descsz = data_len;
> + note->n_type   = type;
> + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
> + memcpy(buf, name, note->n_namesz);
> + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
> + memcpy(buf, data, data_len);
> + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
> +
> + return buf;
> +}
> +
> +void final_note(Elf_Word *buf)
> +{
> + memset(buf, 0, sizeof(struct elf_note));
> +}
> +
> +static void update_vmcoreinfo_note(void)
> +{
> + u32 *buf = vmcoreinfo_note;
> +
> + if (!vmcoreinfo_size)
> + return;
> + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
> +   vmcoreinfo_size);
> + final_note(buf);
> +}
> +
> +void crash_update_vmcoreinfo_safecopy(void *ptr)
> +{
> + if (ptr)
> + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
> +
> + vmcoreinfo_data_safecopy = ptr;
> +}
> +
> +void crash_save_vmcoreinfo(void)
> +{
> + if (!vmcoreinfo_note)
> + return;
> +
> + /* Use the safe copy to generate vmcoreinfo note if have */
> + if (vmcoreinfo_data_safecopy)
> + vmcoreinfo_data = vmcoreinfo_data_safecopy;
> +
> + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
> + update_vmcoreinfo_note();
> +}
> +
> +void vmcoreinfo_append_str(const char *fmt, ...)
> +{
> + va_list args;
> + char buf[0x50];
> + size_t r;
> +
> + va_start(args, fmt);
> + r = vscnprintf(buf, sizeof(buf), fmt, args);
> + va_end(args);
> +
> + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
> +
> + memcpy(_data[vmcoreinfo_size], buf, r);
> +
> + vmcoreinfo_size += r;
> +
> + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
> +   "vmcoreinfo data exceeds allocated size, truncating");
> +}
> +
> +/*
> + * provide an empty default implementation here -- architecture
> + * code may override this
> + */
> +void __weak arch_crash_save_vmcoreinfo(void)
> +{}
> +
> +phys_addr_t __weak paddr_vmcoreinfo_note(void)
> +{
> + return __pa(vmcoreinfo_note);
> +}
> +EXPORT_SYMBOL(paddr_vmcoreinfo_note);
> +
> +int get_note_size(void)
> +{
> + return VMCOREINFO_NOTE_SIZE;
> +}
> +
> +static int __init crash_save_vmcoreinfo_init(void)
> +{
> + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
> + if (!vmcoreinfo_data) {
> + pr_warn("Memory allocation for vmcoreinfo_data failed\n");
> + return -ENOMEM;
> + }
> +
> + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
> + GFP_KERNEL | __GFP_ZERO);
> + if (!vmcoreinfo_note) {
> + free_page((unsigned long)vmcoreinfo_data);
> + vmcoreinfo_data = NULL;
> + pr_warn("Memory allocation for vmcoreinfo_note failed\n");
> + return -ENOMEM;
> + }
> +
> + VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
> + VMCOREINFO_BUILD_ID();
> + VMCOREINFO_PAGESIZE(PAGE_SIZE);
> +
> + VMCOREINFO_SYMBOL(init_uts_ns);
> + VMCOREINFO_OFFSET(uts_namespace, name);
> + 

Re: [PATCH 07/10] printk: Remove the now superfluous sentinel elements from ctl_table array

2023-12-06 Thread Petr Mladek
On Mon 2023-12-04 09:56:28, Joel Granados wrote:
> Hey Petr
> 
> I missed this message somehow
> 
> On Tue, Nov 28, 2023 at 03:07:43PM +0100, Petr Mladek wrote:
> > On Tue 2023-11-07 14:45:07, Joel Granados via B4 Relay wrote:
> > > From: Joel Granados 
> > > 
> > > This commit comes at the tail end of a greater effort to remove the
> > > empty elements at the end of the ctl_table arrays (sentinels) which
> > > will reduce the overall build time size of the kernel and run time
> > > memory bloat by ~64 bytes per sentinel (further information Link :
> > > https://lore.kernel.org/all/zo5yx5jfoggi%2f...@bombadil.infradead.org/)
> > > 
> > > rm sentinel element from printk_sysctls
> > > 
> > > Signed-off-by: Joel Granados 
> > 
> > I am a bit sceptical if the size and time reduction is worth the
> > effort. I feel that this change makes the access a bit less secure.
> In what way "less secure"? Can you expand on that?
> 
> Notice that if you pass a pointer to the register functions, you will
> get a warning/error on compilation.

I have vague memories that some arrays were not static or the length
has been somehow manipulated. But I might be wrong.

You are right that it should be safe with the static arrays.
And the NULL sentinel might be more error-prone after all.

Let's forget my mumbles.

Best Regards,
Petr



Re: [PATCH 07/10] printk: Remove the now superfluous sentinel elements from ctl_table array

2023-11-28 Thread Petr Mladek
On Tue 2023-11-07 14:45:07, Joel Granados via B4 Relay wrote:
> From: Joel Granados 
> 
> This commit comes at the tail end of a greater effort to remove the
> empty elements at the end of the ctl_table arrays (sentinels) which
> will reduce the overall build time size of the kernel and run time
> memory bloat by ~64 bytes per sentinel (further information Link :
> https://lore.kernel.org/all/zo5yx5jfoggi%2f...@bombadil.infradead.org/)
> 
> rm sentinel element from printk_sysctls
> 
> Signed-off-by: Joel Granados 

I am a bit sceptical if the size and time reduction is worth the
effort. I feel that this change makes the access a bit less secure.

Well, almost all arrays are static so that it should just work.
The patch does what it says. Feel free to use:

Reviewed-by: Petr Mladek 

Best Regards,
Petr



Re: [PATCH 08/11] sysctl: Add size to register_sysctl_init

2023-06-28 Thread Petr Mladek
On Thu 2023-06-22 16:00:21, Joel Granados wrote:
> On Thu, Jun 22, 2023 at 06:21:48AM +0200, Jiri Slaby wrote:
> > On 21. 06. 23, 15:15, Joel Granados wrote:
> > > On Wed, Jun 21, 2023 at 12:47:58PM +0200, Greg Kroah-Hartman wrote:
> > > > On Wed, Jun 21, 2023 at 11:09:57AM +0200, Joel Granados wrote:
> > > > >   static int __init random_sysctls_init(void)
> > > > >   {
> > > > > - register_sysctl_init("kernel/random", random_table);
> > > > > + register_sysctl_init("kernel/random", random_table,
> > > > > +  ARRAY_SIZE(random_table));
> > > > 
> > > > As mentioned before, why not just do:
> > > > 
> > > > #define register_sysctl_init(string, table) \
> > > > __register_sysctl_init(string, table, ARRAY_SIZE(table);
> > > Answered you in the original mail where you suggested it.
> > 
> > I am curious what that was, do you have a link?
> of course. I think you will find it here 
> https://lore.kernel.org/all/20230621123816.ufqbob6qthz4hujx@localhost/

Let me to copy the answer here:


I considered this at the outset, but it will not work with callers that
use a pointer instead of the actual array.
Additionally, we would not avoid big commits as we would have to go
looking in all the files where register is called directly or indirectly
and make sure the logic is sound.


For the callers using a pointer. A solution would be to create another
wrapper which would take the array size, e.g.

#define register_sysctl_init_limited(string, table, size)   \
__register_sysctl_init(string, table, size);


And ARRAY_SIZE() is defined in include/linux/kernel.h as:

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))

It will create a compiler error when either an array[] or *array is
passed.

When using this:

1. The compiler will tell us where the other wrapper is needed.

2. Some locations might need the @size parameter even when a static
   array is passed. For example, neigh_sysctl_register() terminates
   the array early.

   But this will work when __register_sysctl_init() supports
   both ways.I mean that it will stop either on @size or empty
   element, as discussed in the other subthread.

   This should be caught when the final "empty" is removed
   from the particular caller.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH 08/11] sysctl: Add size to register_sysctl_init

2023-06-28 Thread Petr Mladek
On Wed 2023-06-21 11:09:57, Joel Granados wrote:
> In order to remove the end element from the ctl_table struct arrays, we
> explicitly define the size when registering the targes. We add a size
> argument to the register_sysctl_init call and pass an ARRAY_SIZE for all
> the callers.

This does not explain the motivatin why the end element is removed.

I agree with Jiri that saving 9k is a questionable gain. According to
the cover letter it saved 0,00%. It is because it saved 9k with allyes
config which produces huge kernel. IMHO, the 9k might be interesting
only for a tiny kernel. But I guess that it would safe much less
bytes there.

And the code with the added ARRAY_SIZE() parameter looks worse than before.

> diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
> index c228343eeb97..28f37b86414e 100644
> --- a/kernel/printk/sysctl.c
> +++ b/kernel/printk/sysctl.c
> @@ -81,5 +81,6 @@ static struct ctl_table printk_sysctls[] = {
>  
>  void __init printk_sysctl_init(void)
>  {
> - register_sysctl_init("kernel", printk_sysctls);
> + register_sysctl_init("kernel", printk_sysctls,
> +  ARRAY_SIZE(printk_sysctls));
>  }

Is register_sysctl_init() still ready to handle the last empty element,
please? I am not in Cc on the related patches.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4] panic: Fixes the panic_print NMI backtrace setting

2023-02-14 Thread Petr Mladek
On Fri 2023-02-10 17:35:10, Guilherme G. Piccoli wrote:
> Commit 8d470a45d1a6 ("panic: add option to dump all CPUs backtraces in 
> panic_print")
> introduced a setting for the "panic_print" kernel parameter to allow
> users to request a NMI backtrace on panic. Problem is that the panic_print
> handling happens after the secondary CPUs are already disabled, hence
> this option ended-up being kind of a no-op - kernel skips the NMI trace
> in idling CPUs, which is the case of offline CPUs.

Great catch!

> Hi folks, thanks in advance for reviews/comments.
> 
> Notice that while at it, I got rid of the "crash_kexec_post_notifiers"
> local copy in panic(). This was introduced by commit b26e27ddfd2a
> ("kexec: use core_param for crash_kexec_post_notifiers boot option"),
> but it is not clear from comments or commit message why this local copy
> is required.
> 
> My understanding is that it's a mechanism to prevent some concurrency,
> in case some other CPU modify this variable while panic() is running.
> I find it very unlikely, hence I removed it - but if people consider
> this copy needed, I can respin this patch and keep it, even providing a
> comment about that, in order to be explict about its need.

Yes, I think that it makes the behavior consistent even when the
global variable manipulated in parallel.

I would personally prefer to keep the local copy. Better safe
than sorry.

> Let me know your thoughts!
> Cheers,
> 
> Guilherme
> 
> 
>  kernel/panic.c | 47 +++
>  1 file changed, 27 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/panic.c b/kernel/panic.c
> index 463c9295bc28..f45ee88be8a2 100644
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -211,9 +211,6 @@ static void panic_print_sys_info(bool console_flush)
>   return;
>   }
>  
> - if (panic_print & PANIC_PRINT_ALL_CPU_BT)
> - trigger_all_cpu_backtrace();
> -

Sigh, this is yet another PANIC_PRINT_ action that need special
timing. We should handle both the same way.

What about the following? The parameter @mask says what
actions are allowed at the given time.

--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -72,6 +72,9 @@ EXPORT_SYMBOL_GPL(panic_timeout);
 #define PANIC_PRINT_FTRACE_INFO0x0010
 #define PANIC_PRINT_ALL_PRINTK_MSG 0x0020
 #define PANIC_PRINT_ALL_CPU_BT 0x0040
+/* Filter out actions that need special timing. */
+#define PANIC_PRINT_COMMON_INFO_MASK   ~(PANIC_PRINT_ALL_PRINTK_MSG |   \
+ PANIC_PRINT_ALL_CPU_BT)
 unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -203,30 +206,29 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
 }
 EXPORT_SYMBOL(nmi_panic);
 
-static void panic_print_sys_info(bool console_flush)
+static void panic_print_sys_info(unsigned long mask)
 {
-   if (console_flush) {
-   if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
-   console_flush_on_panic(CONSOLE_REPLAY_ALL);
-   return;
-   }
+   unsigned long panic_print_now = panic_print & mask;
+
+   if (panic_print_now & PANIC_PRINT_ALL_PRINTK_MSG)
+   console_flush_on_panic(CONSOLE_REPLAY_ALL);
 
-   if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+   if (panic_print_now & PANIC_PRINT_ALL_CPU_BT)
trigger_all_cpu_backtrace();
 
-   if (panic_print & PANIC_PRINT_TASK_INFO)
+   if (panic_print_now & PANIC_PRINT_TASK_INFO)
show_state();
 
-   if (panic_print & PANIC_PRINT_MEM_INFO)
+   if (panic_print_now & PANIC_PRINT_MEM_INFO)
show_mem(0, NULL);
 
-   if (panic_print & PANIC_PRINT_TIMER_INFO)
+   if (panic_print_now & PANIC_PRINT_TIMER_INFO)
sysrq_timer_list_show();
 
-   if (panic_print & PANIC_PRINT_LOCK_INFO)
+   if (panic_print_now & PANIC_PRINT_LOCK_INFO)
debug_show_all_locks();
 
-   if (panic_print & PANIC_PRINT_FTRACE_INFO)
+   if (panic_print_now & PANIC_PRINT_FTRACE_INFO)
ftrace_dump(DUMP_ALL);
 }
 
@@ -333,9 +335,12 @@ void panic(const char *fmt, ...)
 *
 * Bypass the panic_cpu check and call __crash_kexec directly.
 */
-   if (!_crash_kexec_post_notifiers) {
+   if (!_crash_kexec_post_notifiers)
__crash_kexec(NULL);
 
+   panic_print_sys_info(PANIC_PRINT_ALL_CPU_BT);
+
+   if (!_crash_kexec_post_notifiers) {
/*
 * Note smp_send_stop is the usual smp shutdown function, which
 * unfortunately means it may not be hardened to work in a
@@ -357,7 +362,7 @@ void panic(const char *fmt, ...)
 */
atomic_notifier_call_chain(_notifier_list, 0, buf);
 
-   panic_print_sys_info(false);
+   panic_print_sys_info(PANIC_PRINT_COMMON_INFO_MASK);
 
kmsg_dump(KMSG_DUMP_PANIC);
 
@@ -386,7 +391,7 @@ void panic(const char *fmt, ...)

Re: [PATCH] docs: gdbmacros: print newest record

2023-01-03 Thread Petr Mladek
On Thu 2022-12-29 14:49:39, John Ogness wrote:
> @head_id points to the newest record, but the printing loop
> exits when it increments to this value (before printing).
> 
> Exit the printing loop after the newest record has been printed.
> 
> The python-based function in scripts/gdb/linux/dmesg.py already
> does this correctly.
> 
> Fixes: e60768311af8 ("scripts/gdb: update for lockless printk ringbuffer")
> Cc: sta...@vger.kernel.org
> Signed-off-by: John Ogness 

JFYI, the patch has been committed into printk/linux.git,
branch for-6.3.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] docs: gdbmacros: print newest record

2023-01-03 Thread Petr Mladek
On Thu 2022-12-29 14:49:39, John Ogness wrote:
> @head_id points to the newest record, but the printing loop
> exits when it increments to this value (before printing).
> 
> Exit the printing loop after the newest record has been printed.
> 
> The python-based function in scripts/gdb/linux/dmesg.py already
> does this correctly.
> 
> Fixes: e60768311af8 ("scripts/gdb: update for lockless printk ringbuffer")
> Cc: sta...@vger.kernel.org
> Signed-off-by: John Ogness 

Great catch!

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V3 08/11] EDAC/altera: Skip the panic notifier if kdump is loaded

2022-12-12 Thread Petr Mladek
On Fri 2022-08-19 19:17:28, Guilherme G. Piccoli wrote:
> The altera_edac panic notifier performs some data collection with
> regards errors detected; such code relies in the regmap layer to
> perform reads/writes, so the code is abstracted and there is some
> risk level to execute that, since the panic path runs in atomic
> context, with interrupts/preemption and secondary CPUs disabled.
> 
> Users want the information collected in this panic notifier though,
> so in order to balance the risk/benefit, let's skip the altera panic
> notifier if kdump is loaded. While at it, remove a useless header
> and encompass a macro inside the sole ifdef block it is used.
> 
> Cc: Borislav Petkov 
> Cc: Petr Mladek 
> Cc: Tony Luck 
> Acked-by: Dinh Nguyen 
> Signed-off-by: Guilherme G. Piccoli 
> 
> ---
> 
> V3:
> - added the ack tag from Dinh - thanks!
> - had a good discussion with Boris about that in V2 [0],
> hopefully we can continue and reach a consensus in this V3.
> [0] 
> https://lore.kernel.org/lkml/46137c67-25b4-6657-33b7-cffdc7afc...@igalia.com/
> 
> V2:
> - new patch, based on the discussion in [1].
> [1] 
> https://lore.kernel.org/lkml/62a63fc2-346f-f375-043a-fa2138527...@igalia.com/
> 
> 
>  drivers/edac/altera_edac.c | 16 
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
> index e7e8e624a436..741fe5539154 100644
> --- a/drivers/edac/altera_edac.c
> +++ b/drivers/edac/altera_edac.c
> @@ -16,7 +16,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> @@ -24,6 +23,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "altera_edac.h"
> @@ -2063,22 +2063,30 @@ static const struct irq_domain_ops a10_eccmgr_ic_ops 
> = {
>  };
>  
>  /** Stratix 10 EDAC Double Bit Error Handler /
> -#define to_a10edac(p, m) container_of(p, struct altr_arria10_edac, m)
> -
>  #ifdef CONFIG_64BIT
>  /* panic routine issues reboot on non-zero panic_timeout */
>  extern int panic_timeout;
>  
> +#define to_a10edac(p, m) container_of(p, struct altr_arria10_edac, m)
> +
>  /*
>   * The double bit error is handled through SError which is fatal. This is
>   * called as a panic notifier to printout ECC error info as part of the 
> panic.
> + *
> + * Notice that if kdump is set, we take the risk avoidance approach and
> + * skip the notifier, given that users are expected to have access to a
> + * full vmcore.
>   */
>  static int s10_edac_dberr_handler(struct notifier_block *this,
> unsigned long event, void *ptr)
>  {
> - struct altr_arria10_edac *edac = to_a10edac(this, panic_notifier);
> + struct altr_arria10_edac *edac;
>   int err_addr, dberror;
>  
> + if (kexec_crash_loaded())
> + return NOTIFY_DONE;

I have read the discussion about v2 [1] and this looks like a bad
approach from my POV.

My understanding is that the information provided by this notifier
could not be found in the crashdump. It means that people really
want to run this before crashdump in principle.

Of course, there is the question how much safe this code is. I mean
if the panic() code path might get blocked here.

I see two possibilities.

The best solution would be if we know that this is "always" safe or if
it can be done a safe way. Then we could keep it as it is or implement
the safe way.

Alternative solution would be to create a kernel parameter that
would enable/disable this particular report when kdump is enabled.
The question would be the default. It would depend on how risky
the code is and how useful the information is.

[1] https://lore.kernel.org/r/20220719195325.402745-11-gpicc...@igalia.com

> + edac = to_a10edac(this, panic_notifier);
>   regmap_read(edac->ecc_mgr_map, S10_SYSMGR_ECC_INTSTAT_DERR_OFST,
>   );
>   regmap_write(edac->ecc_mgr_map, S10_SYSMGR_UE_VAL_OFST, dberror);

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V4] notifier/panic: Introduce panic_notifier_filter

2022-03-08 Thread Petr Mladek
On Mon 2022-03-07 11:25:30, Guilherme G. Piccoli wrote:
> On 07/03/2022 11:04, b...@redhat.com wrote:
> > [...]
> > Ah, sorry, I even didn't notice that. That's awesome if we can make use
> > of that. While I still have concerns:
> > 
> 
> Thanks, nice that you liked the idea.
> 
> > 1) about those we have decided to take out from panic notifier list and
> > put before kdump, e.g the Hypver-V notifier, how will we do with it? Are
> > we going to handle them as we have discussed?
> > 
> 
> While implementing that I will think of something, but if
> understood/remember correctly Hyper-V gonna be one of the first to run
> in the first notifier list proposed by Petr - so we might still use
> ordering by priority there, having Hyper-V being the first heh

My understanding is that the problem is not a priority but an ordering
against other operations.

Namely, Hyper-V must be called even before crash dump. Some others before
kmsg_dump(). And the rest only when the crash dump is not called at all.


> > 2) Combing and settling priority for all existing panic notifier looks
> > great, even though it will take some effort. How about the later newly
> > added one? How can we guarantee that those new notifiers are getting
> > appropriate priority to mark their order? Sometime we even don't know
> > a new panic notifier is added since code change may be made in any
> > component or driver.
> > 
> 
> This is a great point! How to do it? One idea is to have a special
> registering function for panic notifiers that checks for priority field
> missing, and good documentation is a good idea as well, always.
> 
> But if you / others have other suggestions, let me know - appreciate that.
> Cheers,

Honestly, I am not that keen about enforcing the priorities.

It would make sense only when the ordering is really important.
Then there should be some rules. It should be obvious why
something has to be done earlier than something else.

>From my POV, it is just another complexity. Someone will need to
assign the priority to the existing notifiers. People will wonder
about it for newly added notifiers.

Reproducibility seems to be the only motivation. Is it really a
problem? Do the notifiers affect each other?

Also the notifiers are typically registered by some early boot
code. It will define some ordering out of box.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V6] panic: Move panic_print before kmsg dumpers

2022-02-25 Thread Petr Mladek
On Fri 2022-02-25 14:18:22, Sergey Senozhatsky wrote:
> IOW move console_flush_on_panic() handling out of panic_print_sys_info().
> console_flush_on_panic() isn't really related to "print sys info" stuff
> that panic_print_sys_info() does.
> 
> Something like this may be:
> 
>  static void panic_print_sys_info(void)
>  {
> - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
> - console_flush_on_panic(CONSOLE_REPLAY_ALL);
> -
>   if (panic_print & PANIC_PRINT_ALL_CPU_BT)
>   trigger_all_cpu_backtrace();
>  
> @@ -196,6 +193,23 @@ static void panic_print_sys_info(void)
>   ftrace_dump(DUMP_ALL);
>  }
>  
> +static void panic_console_flush(void)
> +{
> + /*
> +  * We may have ended up stopping the CPU holding the lock (in
> +  * smp_send_stop()) while still having some valuable data in the console
> +  * buffer.  Try to acquire the lock then release it regardless of the
> +  * result.  The release will also print the buffers out.  Locks debug
> +  * should be disabled to avoid reporting bad unlock balance when
> +  * panic() is not being callled from OOPS.
> +  */
> + debug_locks_off();
> + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
> + console_flush_on_panic(CONSOLE_REPLAY_ALL);
> + else
> + console_flush_on_panic(CONSOLE_FLUSH_PENDING);
> +}
> +
>  /**
>   *   panic - halt the system
>   *   @fmt: The text string to print
> @@ -329,17 +343,7 @@ void panic(const char *fmt, ...)
>  #endif
>   console_unblank();
>  
> - /*
> -  * We may have ended up stopping the CPU holding the lock (in
> -  * smp_send_stop()) while still having some valuable data in the console
> -  * buffer.  Try to acquire the lock then release it regardless of the
> -  * result.  The release will also print the buffers out.  Locks debug
> -  * should be disabled to avoid reporting bad unlock balance when
> -  * panic() is not being callled from OOPS.
> -  */
> - debug_locks_off();
> - console_flush_on_panic(CONSOLE_FLUSH_PENDING);
> -
> + panic_console_flush();
>   panic_print_sys_info();
>  
>   if (!panic_blink)

The result looks good to me. We could do it as a followup patch.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V6] panic: Move panic_print before kmsg dumpers

2022-02-24 Thread Petr Mladek
On Wed 2022-02-23 01:27:35, Sergey Senozhatsky wrote:
> On (22/02/22 11:10), Guilherme G. Piccoli wrote:
> > On 21/02/2022 23:06, Sergey Senozhatsky wrote:
> > > On (22/02/14 11:13), Guilherme G. Piccoli wrote:
> > > [...]
> > > By additional panic_print messages you mean that panic_print_sys_info()
> > > will print everything (except PANIC_PRINT_ALL_PRINTK_MSG) twice?
> > > 
> > > Do we really need to dump everything twice? show_mem(), show_state(),
> > > ftrace_dump(DUMP_ALL). That's quite a bit of extra data.
> > > 
> > 
> > Oh no, we don't print everything twice, that'd be insane heh
> 
> My bad! I did not spot the `return` at the end of the new branch.
> 
> +   if (console_flush) {
> +   if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
> +   console_flush_on_panic(CONSOLE_REPLAY_ALL);
> +   return;
> +   }
> 
> Hmm. Yeah, well, that's a bit of a tricky interface now
> 
>   panic()
>   // everything (if corresponding bits set), no console flush
>   panic_print_sys_info(false)
>   ...
>   // console flush only if corresponding bit set
>   panic_print_sys_info(true)

I agree that self-explaining names are always better than true/false.
It is pity that replay the log is handled in panic_print at all.

I sometimes hide these tricks into wrappers. We could rename:

panic_printk_sys_info() -> panic_print_handler()

and add wrappers:

void panic_print_sys_info()
{
panic_printk_handler(false);
}

void panic_print_log_replay()
{
panic_printk_handler(true);
}

Or just split panic_printk_sys_info() into these two functions.

> If everyone is fine then OK.
> 
> But I _personally_ would look into changing this to something like this:
> 
>   #define EARLY_PANIC_MASK (PANIC_PRINT_FOO | PANIC_PRINT_BAR | ...)
>   #define LATE_PANIC_MASK (PANIC_PRINT_ALL_PRINTK_MSG)

These lists cause merge and backporting conflicts. I vote to avoid
this approach ;-)


>   panic()
>   panic_print_sys_info(panic_print & EARLY_PANIC_MASK)
>   ...
>   panic_print_sys_info(panic_print & LATE_PANIC_MASK)

That said, I could live with the patch as is. I leave the decision
to Andrew. Feel free to use:

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V6] panic: Move panic_print before kmsg dumpers

2022-02-15 Thread Petr Mladek
On Mon 2022-02-14 11:13:09, Guilherme G. Piccoli wrote:
> The panic_print setting allows users to collect more information in a
> panic event, like memory stats, tasks, CPUs backtraces, etc.
> This is an interesting debug mechanism, but currently the print event
> happens *after* kmsg_dump(), meaning that pstore, for example, cannot
> collect a dmesg with the panic_print extra information.
> 
> This patch changes that in 2 steps:
> 
> (a) The panic_print setting allows to replay the existing kernel log
> buffer to the console (bit 5), besides the extra information dump.
> This functionality makes sense only at the end of the panic() function.
> So, we hereby allow to distinguish the two situations by a new boolean
> parameter in the function panic_print_sys_info().
> 
> (b) With the above change, we can safely call panic_print_sys_info()
> before kmsg_dump(), allowing to dump the extra information when using
> pstore or other kmsg dumpers.
> 
> The additional messages from panic_print could overwrite the oldest
> messages when the buffer is full. The only reasonable solution is to
> use a large enough log buffer, hence we added an advice into the kernel
> parameters documentation about that.
> 
> Cc: Feng Tang 
> Cc: Petr Mladek 
> Acked-by: Baoquan He 
> Signed-off-by: Guilherme G. Piccoli 

Makes sense and looks good to me.

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V4] notifier/panic: Introduce panic_notifier_filter

2022-01-28 Thread Petr Mladek
On Thu 2022-01-27 14:16:20, Guilherme G. Piccoli wrote:
> On 25/01/2022 10:06, d.hatay...@fujitsu.com wrote:
> > 
> > But the pre_dump cannot avoid calling multiple unnecessary handlers, right?
> > It's more risky than the previous idea...
> > 
> 
> I think we could have 2 kernel parameters then:
> 
> crash_kernel_disable_pre_notitifers (of course we can think in some
> better name here heh)
> 
> crash_kernel_enable_post_notifiers (which is the same as the current
> "crash_kernel_post_notifiers", we can keep it)
> 
> The point being (if I understand correctly): some callbacks are really
> simple and don't introduce risks for kdump, like the RCU; a bunch of
> them just set one variable. Those could be enable by default, before the
> kdump.
> 
> The majority would fit in the 2nd group, meaning they are not enabled by
> default, requiring some parameter for that.
> 
> Petr, let me know if that makes sense and is aligned with your suggestion.

First, I am sorry for the very long mail. But the problem is really
complicated. I did my best to describe it a clean way.

I have discussed these problems with a colleague and he had some good
points. And my view evolved even further.

There are two groups of people interested in panic() behavior:

1. Users wants to get as reliable as possible: kdump, kmsg_dump,
   console log, useful last message on screen, reboot, hypervisor
   notification.

   Different users have different priorities according to the use case.


2. Subsystem maintainers and developers that need to do something
   special in panic(). They have to deal with the user requirements
   and bug reports.

   Most operations in panic() have unclear results because the system
   is in unclear state. Maintainers and developers wants to make their
   life easier. They do not want to deal with problems caused by
   others. So that they want to disable others or run as early as
   possible.

   It is nicely visible. kdump maintainer is afraid of calling
   anything before kdump. Many people support the idea of filtering
   because it moves problems to the user side.


I see two basic problems here: ordering and reliability:

1. Ordering problems are partly solved by configuration and partly by
   definition. I mean that:

  + kdump, kmsg_dump, panic_print_sys_info() are optional
  + console output is the best effort; more risky in final flush
  + reboot, infinite loop are the very last step

   IMHO, the ordering should be pretty clear:

  + panic_print_sys_info(), kmsg_dump(), kdump(), console flush, reboot

   Why?

  + panic_print_sys_info(), kmsg_dump(), kdump() are all optional
   and disabled by default
  + Users might want panic_print_sys_info() in kmsg_dump() and kdump()
  + Users might prefer kmsg_dump() over kdump()
  + kdump() is the last operation when enabled

   Where are panic notifiers in this picture?
   Where are CPUs stopped?


2. Reliability is the best effort and any subsystem should do its
   best.

   Users need to be aware (documentation, warning) that:

  + kmsg_dump() is less reliable when panic_print_sys_info() is enabled
  + kdump() is less reliable when panic_print_sys_info() and/or
kmsg_dump() is enabled.

   Where are panic notifiers in this picture?
   How stopped CPUs affect reliability?


Regarding panic notifiers. They look like a problematic black box:

+ ordering against other operations is not clear
+ are not safe enough
+ various users require some and do not need others
+ some are too complex so that only few people know what
  they do


So far, we have two proposals how to handle panic notifiers:

1. Allow to filter them with parameter:

 + pros:
+ it allows users to customize and work around problems

 + cons:
+ ordering is still not clear

+ user has to know what he does; note that sometimes only
  few people know what their notifier does

+ hard to use in the long term; callbacks names are
  implementation detail; new notifiers are added

+ lower motivation to solve problems; easy to wave them with
  "just disable it when it does not work for you..."


2. Split notifiers into more lists:

+ pros:
+ might solve ordering problems

+ subsystem maintainers could find the proper and more safe
  location


+ cons:
+ subsystem maintainers tend to think that their problem is
  the most important one; they will tend to do the operation
  as early as possible; so that even dangerous operations
  might be done early  => the original problem is still there

+ it might not motivate developers enough to make the notifiers as
  safe as possible

+ some might still need to be optional; for example, it should
  be possible to disable hypervisor notifier when it breaks
  kdump


Regarding stopped CPUs, it looks like a good idea most of 

Re: [PATCH V4] notifier/panic: Introduce panic_notifier_filter

2022-01-26 Thread Petr Mladek
On Wed 2022-01-26 11:10:39, Baoquan He wrote:
> On 01/24/22 at 11:48am, Guilherme G. Piccoli wrote:
> > On 24/01/2022 10:59, Baoquan He wrote:
> > > [...]
> > > About pre_dump, if the dump is crash dump, hope those pre_dump notifiers
> > > will be executed under conditional check, e.g only if 
> > > 'crash_kexec_post_notifiers'
> > > is specified in kernel cmdline. 
> > 
> > Hi Baoquan, based on Petr's suggestion, I think pre_dump would be
> > responsible for really *non-intrusive/non-risky* tasks and should be
> > always executed in the panic path (before kdump), regardless of
> > "crash_kexec_post_notifiers".
> > 
> > The idea is that the majority of the notifiers would be executed in the
> > post_dump portion, and for that, we have the
> > "crash_kexec_post_notifiers" conditional. I also suggest we have
> > blacklist options (based on function names) for both notifiers, in order
> > to make kdump issues debug easier.
> > 
> > Do you agree with that? Feel free to comment with suggestions!
> > Cheers,
> 
> I would say "please NO" cautiously.
> 
> As Petr said, kdump mostly works only if people configure it correctly.
> That's because we try best to switch to kdump kernel from the fragile
> panicked kernel immediately. When we try to add anthing before the switching,
> please consider carefully and ask if that adding is mandatory, otherwise
> switching into kdump kernel may fail. If the answer is yes, the adding
> is needed and welcomed. Othewise, any unnecessary action, including any
> "non-intrusive/non-risky" tasks, would be unwelcomed.

I still do not have the complete picture. But it seems that some
actions make always sense even for kdump:

+ Super safe operations that might disable churn from broken
  system. For examle, disabling watchdogs by setting a single
  variable, see rcu_panic() notifier

+ Actions needed that allow to kexec the crash kernel a safe
  way under some hypervisor, see
  
https://lore.kernel.org/r/mwhpr21mb15933573f5c81c5250bf6a1cd7...@mwhpr21mb1593.namprd21.prod.outlook.com


> Surely, we don't oppose the "non-intrusive/non-risky" or completely
> "intrusive/risky" action adding before kdump kernel switching, with a
> conditional limitation. When we handle customers' kdump support, we
> explicitly declare we only support normal and default kdump operation.
> If any action which is done before switching into kdump kernel is specified,
> e.g panic_notifier, panic_print, they need take care of their own kdump
> failure.

All this actually started because of kmsg_dump. It might make sense to
allow both kmsg_dump and kdump together. The messages stored by
kmsg_dump might be better than nothing when kdump fails.

It actually seems to be the main motivation to introduce
"crash_kexec_post_notifier" parameter, see the commit
f06e5153f4ae2e2f3b03 ("kernel/panic.c: add "crash_kexec_post_notifiers"
option for kdump after panic_notifers").

And this patch introduces panic_notifier_filter that tries to select
notifiers that are helpful and harmful. IMHO, it is almost unusable.
It seems that even kernel developers do not understand what exactly
some notifiers do and why they are needed. Usually only the author
and people familiar with the subsystem have some idea. It makes
it pretty hard for anyone to create a reasonable filter.

I am pretty sure that we could do better. I propose to add more
notifier lists that will be called at various places with reasonable
rules and restrictions. Then the susbsystem maintainers could decide
where exactly a given action must be done.

The result might be that we will need only few options that will
enable/disable some well defined optional features.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V3] panic: Move panic_print before kmsg dumpers

2022-01-26 Thread Petr Mladek
On Mon 2022-01-24 16:57:17, Michael Kelley (LINUX) wrote:
> From: Baoquan He  Sent: Friday, January 21, 2022 8:34 PM
> > 
> > On 01/21/22 at 03:00pm, Michael Kelley (LINUX) wrote:
> > > From: Baoquan He  Sent: Thursday, January 20, 2022 6:31 
> > > PM
> > > >
> > > > On 01/20/22 at 06:36pm, Guilherme G. Piccoli wrote:
> > > > > Hi Baoquan, some comments inline below:
> > > > >
> > > > > On 20/01/2022 05:51, Baoquan He wrote:
> 
> [snip]
> 
> > > > > Do you think it should be necessary?
> > > > > How about if we allow users to just "panic_print" with or without the
> > > > > "crash_kexec_post_notifiers", then we pursue Petr suggestion of
> > > > > refactoring the panic notifiers? So, after this future refactor, we
> > > > > might have a much clear code.
> > > >
> > > > I haven't read Petr's reply in another panic notifier filter thread. For
> > > > panic notifier, it's only enforced to use on HyperV platform, excepto of
> > > > that, users need to explicitly add "crash_kexec_post_notifiers=1" to 
> > > > enable
> > > > it. And we got bug report on the HyperV issue. In our internal 
> > > > discussion,
> > > > we strongly suggest HyperV dev to change the default enablement, instead
> > > > leave it to user to decide.
> > > >
> > >
> > > Regarding Hyper-V:   Invoking the Hyper-V notifier prior to running the
> > > kdump kernel is necessary for correctness.  During initial boot of the
> > > main kernel, the Hyper-V and VMbus code in Linux sets up several guest
> > > physical memory pages that are shared with Hyper-V, and that Hyper-V
> > > may write to.   A VMbus connection is also established. Before kexec'ing
> > > into the kdump kernel, the sharing of these pages must be rescinded
> > > and the VMbus connection must be terminated.   If this isn't done, the
> > > kdump kernel will see strange memory overwrites if these shared guest
> > > physical memory pages get used for something else.
> 
> In the Azure cloud, collecting data before crash dumps is a motivation
> as well for setting crash_kexec_post_notifiers to true.   That way as
> cloud operator we can see broad failure trends, and in specific cases
> customers often expect the cloud operator to be able to provide info
> about a problem even if they have taken a kdump.  Where did you
> envision adding a comment in the code to help clarify these intentions?
> 
> I looked at the code again, and should revise my previous comments
> somewhat.   The Hyper-V resets that I described indeed must be done
> prior to kexec'ing the kdump kernel.   Most such resets are actually
> done via __crash_kexec() -> machine_crash_shutdown(), not via the
> panic notifier. However, the Hyper-V panic notifier must terminate the
> VMbus connection, because that must be done even if kdump is not
> being invoked.  See commit 74347a99e73.
>
> Most of the hangs seen in getting into the kdump kernel on Hyper-V/Azure 
> were probably due to the machine_crash_shutdown() path, and not due
> to running the panic notifiers prior to kexec'ing the kdump kernel.  The
> exception is terminating the VMbus connection, which had problems that
> are hopefully now fixed because of adding a timeout.

My undestanding is that we could split the actions into three groups:

  1. Actions that has to be before kexec'ing kdump kernel, like
 resetting physicall memory shared with Hyper-V.

 These operation(s) are needed only for kexec and can be done
 in kexec.


   2. Notify Hyper-V so that, for example, Azure cloud, could collect
  data before crash dump.

  It is nice to have.

  It should be configurable if it is not completely safe. I mean
  that there should be a way to disable it when it might increase
  the risk that kexec'ing kdump kernel might fail.


   3. Some actions are needed only when panic() ends up in the
  infinite loop.

  For example, unloading vmbus channel. At least the commit
  74347a99e73ae00b8385f ("x86/Hyper-V: Unload vmbus channel in
  hv panic callback") says that it is done in kdump path
  out of box.

All these operations are needed and used only when the kernel is
running under Hyper-V.

My mine intention is to understand if we need 2 or 3 notifier lists
or the current one is enough.

The 3 notifier lists would be:

   + always do (even before kdump)
   + optionally do before or after kdump
   + needed only when kdump is not called

Thanks a lot for the very valuable input.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V4] notifier/panic: Introduce panic_notifier_filter

2022-01-20 Thread Petr Mladek
Adding some more people into Cc. Some modified the logic in the past.
Some are familiar with some interesting areas where the panic
notfiers are used.

On Sat 2022-01-08 12:34:51, Guilherme G. Piccoli wrote:
> The kernel notifier infrastructure allows function callbacks to be
> added in multiple lists, which are then called in the proper time,
> like in a reboot or panic event. The panic_notifier_list specifically
> contains the callbacks that are executed during a panic event. As any
> other notifier list, the panic one has no filtering and all functions
> previously registered are executed.
> 
> The kdump infrastructure, on the other hand, enables users to set
> a crash kernel that is kexec'ed in a panic event, and vmcore/logs
> are collected in such crash kernel. When kdump is set, by default
> the panic notifiers are ignored - the kexec jumps to the crash kernel
> before the list is checked and callbacks executed.
> 
> There are some cases though in which kdump users might want to
> allow panic notifier callbacks to execute _before_ the kexec to
> the crash kernel, for a variety of reasons - for example, users
> may think kexec is very prone to fail and want to give a chance
> to kmsg dumpers to run (and save logs using pstore),

Yes, this seems to be original intention for the
"crash_kexec_post_notifiers" option, see the commit
f06e5153f4ae2e2f3b0300f ("kernel/panic.c: add
"crash_kexec_post_notifiers" option for kdump after panic_notifiers")


> some panic notifier is required to properly quiesce some hardware
> that must be used to the crash kernel.

Do you have any example, please? The above mentioned commit
says "crash_kexec_post_notifiers" actually increases risk
of kdump failure.

Note that kmsg_dump() is called after the notifiers only because
some are printing more information, see the commit
6723734cdff15211bb78a ("panic: call panic handlers before kmsg_dump").
They might still increase the change that kmsg_dump() will never
be called.


> But there's a problem: currently it's an "all-or-nothing" situation,
> the kdump user choice is either to execute all panic notifiers or
> none of them. Given that panic notifiers may increase the risk of a
> kdump failure, this is a tough decision and may affect the debug of
> hard to reproduce bugs, if for some reason the user choice is to
> enable panic notifiers, but kdump then fails.
>
> So, this patch aims to ease this decision: we hereby introduce a filter
> for the panic notifier list, in which users may select specifically
> which callbacks they wish to run, allowing a safer kdump. The allowlist
> should be provided using the parameter "panic_notifier_filter=a,b,..."
> where a, b are valid callback names. Invalid symbols are discarded.

I am afraid that this is almost unusable solution:

   + requires deep knowledge of what each notifier does
   + might need debugging what notifier causes problems
   + the list might need to be updated when new notifiers are added
   + function names are implementation detail and might change
   + requires kallsyms


It is only workaround for a real problem. The problem is that
"panic_notifier_list" is used for many purposes that break
each other.

I checked some notifiers and found few groups:

   + disable watchdogs:
  + hung_task_panic()
  + rcu_panic()

   + dump information:
  + kernel_offset_notifier()
  + trace_panic_handler() (duplicate of panic_print=0x10)

   + inform hypervisor
  + xen_panic_event()
  + pvpanic_panic_notify()
  + hyperv_panic_event()

   + misc cleanup / flush / blinking
  + panic_event()   in ipmi_msghandler.c
  + panic_happened()   in heartbeat.c
  + led_trigger_panic_notifier()


IMHO, the right solution is to split the callbacks into 2 or more
notifier list. Then we might rework panic() to do:

void panic(void)
{
[...]

/* stop watchdogs + extra info */
atomic_notifier_call_chain(_disable_watchdogs_notifier_list, 0, 
buf);
atomic_notifier_call_chain(_info_notifier_list, 0, buf);
panic_print_sys_info();

/* crash_kexec + kmsg_dump in configurable order */
if (!_crash_kexec_post_kmsg_dump) {
__crash_kexec(NULL);
smp_send_stop();
} else {
crash_smp_send_stop();
}

kmsg_dump();
if (_crash_kexec_post_kmsg_dump)
__crash_kexec(NULL);

/* infinite loop or reboot */
atomic_notifier_call_chain(_hypervisor_notifier_list, 0, buf);
atomic_notifier_call_chain(_rest_notifier_list, 0, buf);

console_flush_on_panic(CONSOLE_FLUSH_PENDING);

if (panic_timeout >= 0) {
timeout();
emergency_restart();
}

for (i = 0; ; i += PANIC_TIMER_STEP) {
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}

Re: [PATCH V3] panic: Move panic_print before kmsg dumpers

2022-01-20 Thread Petr Mladek
On Wed 2022-01-19 13:03:15, Guilherme G. Piccoli wrote:
> Thanks again Petr, for the deep analysis! Much appreciated.
> Some comments inline below:
> 
> 
> On 19/01/2022 12:48, Petr Mladek wrote:
> >[...]
> > From my POV, the function of panic notifiers is not well defined. They
> > do various things, for example:
> > [...] 
> > 
> > The do more that just providing information. Some are risky. It is not
> > easy to disable a particular one.
> 
> We are trying to change that here:
> https://lore.kernel.org/lkml/20220108153451.195121-1-gpicc...@igalia.com/
> 
> Your review/comments are very welcome =)
> 
> 
> > [...] 
> > It might make sense to allow to call kmsg_dump before panic notifiers
> > to reduce the risk of a breakage. But I do not have enough experience
> > with them to judge this.
> > 
> > I can't remember any bug report in this code. I guess that only
> > few people use kmsg_dump.
> 
> One of the problems doing that is that RCU and hung task detector, for
> example, have panic notifiers to disable themselves in the panic
> scenario - if we kmsg_dump() _before_ the panic notifiers, we may have
> intermixed messages, all messy...so for me it makes sense to keep the
> kmsg_dump() after panic notifiers.

It makes perfect sense to disable the watchdogs during panic().
For example, rcu_panic() just sets a variable:

static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
rcu_cpu_stall_suppress = 1;
return NOTIFY_DONE;
}

It is quick and super-safe. It might make sense to implement similar
thing for other watchdogs and do something like:

void panic_supress_watchdogs(void)
{
rcu_panic();
softlockup_watchog_panic();
wq_watchog_panic();
}

It might be caller early in panic().

> 
> > [...]
> > Yes, panic_print_sys_info() increases the risk that the crash dump
> > will not succeed. But the change makes sense because:
> > 
> >   + panic_print_sys_info() might be useful even with full crash dump.
> > For example, ftrace messages are not easy to read from the memory
> > dump.
> 
> The last point is really good, I didn't consider that before but makes a
> lot of sense - we can now dump (a hopefully small!) ftrace/event trace
> buffer to dmesg before a kdump, making it pretty easy to read that later.
> Cheers,

JFYI, there is an extension for the crash tool that might be able to read
the trace log, see https://crash-utility.github.io/extensions.html

I haven't tested it myself yet.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V3] panic: Move panic_print before kmsg dumpers

2022-01-19 Thread Petr Mladek
On Fri 2022-01-14 15:30:46, Guilherme G. Piccoli wrote:
> The panic_print setting allows users to collect more information in a
> panic event, like memory stats, tasks, CPUs backtraces, etc.
> This is a pretty interesting debug mechanism, but currently the print
> event happens *after* kmsg_dump(), meaning that pstore, for example,
> cannot collect a dmesg with the panic_print information.

The above text is OK.

The below commit message is quite hard to follow. The sentences are very
long and complex. I tend to do the same mistakes. I will try to
suggest some improvements.


> This patch changes that in 2 ways:

> (a) First, after a good discussion with Petr in the mailing-list[0],
> he noticed

The above is not needed ;-) It is better to just describe the problem
and the solution a directive way.

> that one specific setting of panic_print (the console replay,
> bit 5) should not be executed before console proper flushing; hence we
> hereby split the panic_print_sys_info() function in upper and lower
> portions: if the parameter "after_kmsg_dumpers" is passed, only bit 5
> (the console replay thing) is evaluated and the function returns - this
> is the lower portion. Otherwise all other bits are checked and the
> function prints the user required information; this is the upper/earlier
> mode.

The meaning of the boolean parameter might be explained by:

"panic_print_sys_info() allows to print additional information into the
kernel log. In addition, it allows to reply the existing log on the
console first.

The extra information is useful also when generating crash dump or
kmsg_dump. But the replay functionality makes sense only at the end
of panic().

Allow to distinguish the two situations by a boolean parameter."



> (b) With the above change, we can safely insert a panic_print_sys_info()
> call up some lines, in order kmsg_dump() accounts this new information
> and exposes it through pstore or other kmsg dumpers. Notice that this
> new earlier call doesn't set "after_kmsg_dumpers" so we print the
> information set by user in panic_print, except the console replay that,
> if set, will be executed after the console flushing.

This paragraph is too complicated. The important information here is:

 "panic_print_sys_info() is moved above kmsg_dump(). It allows to dump
 the extra information."

The trick with the boolean is already explained elsewhere. It is not
needed to repeat it in this paragraph.


> Also, worth to notice we needed to guard the panic_print_sys_info(false)
> calls against double print - we use kexec_crash_loaded() helper for that
> (see discussion [1] for more details).

This should be explained together with the reason to call it on two
locations, see below.


> In the first version of this patch we discussed the risks in the
> mailing-list [0], and seems this approach is the least terrible,

the above is not needed.

> despite still having risks of polluting the log with the bulk of
> information that panic_print may bring, losing older messages.
> In order to attenuate that, we've added a warning in the
> kernel-parameters.txt so that users enabling this mechanism consider
> to increase the log buffer size via "log_buf_len" as well.

Again, I would describe the problem and solution a directive way.

"The additional messages might overwrite the oldest messages when
 the buffer is full. The only reasonable solution is to use large
 enough log buffer. The advice is added into kernel-parameters.txt."



> Finally, another decision was to keep 2 panic_print_sys_info(false)
> calls (instead of just bringing it up some code lines and keep a single
> call) due to the panic notifiers:

The above is too complicated and not important.

> if kdump is not set, currently the
> panic_print information is collected after the notifiers and since
> it's a bit safer this way,

The important information is why it is safer. Honestly, I am still not sure
about this claim. I have checked several notifiers today and they did
not added much stability.


> we decided to keep it as is, only modifying
> the kdump case as per the previous commit [2] (see more details about
> this discussion also in thread [1]).

This does not provide much information. Link to linux-next is bad
idea, see below.


> [0] https://lore.kernel.org/lkml/20211230161828.121858-1-gpicc...@igalia.com
> [1] 
> https://lore.kernel.org/lkml/f25672a4-e4dd-29e8-b2db-f92dd9ff9...@igalia.com
> [2] 
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=5613b7538f69

linux-next is regularly rebased and Andrew's patches are always newly
imported from quilt. The commit probably already has another hash.

> 
> Cc: Feng Tang 
> Cc: Petr Mladek 
> Signed-off-by: Guilher

Re: [PATCH V3] panic: Move panic_print before kmsg dumpers

2022-01-19 Thread Petr Mladek
On Wed 2022-01-19 15:13:18, Baoquan He wrote:
> On 01/14/22 at 03:30pm, Guilherme G. Piccoli wrote:
> > The panic_print setting allows users to collect more information in a
> > panic event, like memory stats, tasks, CPUs backtraces, etc.
> > This is a pretty interesting debug mechanism, but currently the print
> > event happens *after* kmsg_dump(), meaning that pstore, for example,
> > cannot collect a dmesg with the panic_print information.
> .. 
> 
> Thanks for the effort.
>   
> 
> I have some concerns about this patch, and it's still not clear to me
> why the thing done in this patch is needed. Hope you don't mind if I ask
> stupid question or things have been discussed in the earlier post.
> 
> Firstly, let me add some background information and details about the
> problem with my understanding, please correct me if anthing is wrong.
> 
> Background:
> ===
> Currently, in panic(), there are 5 different ways to try best to
> collect information:
> 1) Vmcore dumping
>done via kdump, high weight, almost get all information we want;
>need switch to kdump kernel immediately.

My experience is that it basically always works when it is correctly
configured. It might be tested before the crash.



> 2) Panic notifier
>iterating those registered handlers, light weight, information got
>depends on those panic handlers. Try its best after panic, do not reboot.


>From my POV, the function of panic notifiers is not well defined. They
do various things, for example:

  + on_panic_nb() flushes s390 console buffers. It looks like a
complex and risky code. For example, __sclp_vt220_flush_buffer()
takes spin locks and calls timer code.

  + dump_gisb_error() prints some info but it also communicates with
the device, see gisb_read() and gisb_write() in
brcmstb_gisb_arb_decode_addr(). I am not sure how reliable this is.

  + parisc_panic_event() re-enables the soft-power switch. I am not
sure how safe this is.

  + pvpanic_panic_notify() takes spin lock and does in iowrite().


The do more that just providing information. Some are risky. It is not
easy to disable a particular one.


> 3) kmsg_dump
>iterating registered dumpers to collect as much kernel log as possible
>to store. E.g on pstore, light weight, only kernel log, do not reboot,
>log can be checked after system reboot.

>From my POV, it is similar functionality like the crash dump.

It might make sense to allow to call kmsg_dump before panic notifiers
to reduce the risk of a breakage. But I do not have enough experience
with them to judge this.

I can't remember any bug report in this code. I guess that only
few people use kmsg_dump.


> 4)console_flush_on_panic(CONSOLE_FLUSH_PENDING);
>Flush to get the pending logbuf printed out to console.

My experience is that there are many problems with consoles.
There might be deadlocks because of internal locks. Some
serial consoles are incredibly slow. Messages in the console
drivers might cause infinite output. Some drivers are pretty
complex, especially tty that is often enabled.

console_flush_on_panic() makes it even worse. It resets console_sem.
It means that it tries to use the consoles even when they are
already in use.

Note that printk() tries to show the messages on the console
immediately. console_flush_on_panic() is just the last
instance when all safe ways failed.


> 5)panic_print
>Print out more system info, including all dmesg, task states, mem
>info, etc.

This adds more information to the kernel log. They can be requested
by kernel commandline parameter. They can be seen only on the console
at the moment.

It makes sense to allow to see them also in crash dump or kmsg_dump.

> About the change, my question is why not packing
> console_flush_on_panic(CONSOLE_FLUSH_PENDING) into panic_print(), and
> move panic_print() up to be above kmsg_dump().

As explained above. console_flush_on_panic() is a dirty hack.
It really has to be called at the end as the last resort.


> And adding panic_print_sys_info(false) before kdump might not be a good
> idea. Firstly, panicked system is very unstable, kdump need switch to
> 2nd kernel asap w/o any extra action so that it can survive and collect
> tons of information. Added panic_print_sys_info(false) is obviously
> unnecessary to kdump, and increase risk to kernel switching.

Yes, panic_print_sys_info() increases the risk that the crash dump
will not succeed. But the change makes sense because:

  + panic_print_sys_info() does nothing by default. Most users will
not enable it together with crash dump.

  + Guilherme uses crash dump only to dump the kernel log. It might
be more reliable than kmsg_dump. In this case, panic_print_sys_info()
is the only way to get the extra information.

  + panic_print_sys_info() might be useful even with full crash dump.
For example, 

Re: [PATCH V2] panic: Move panic_print before kmsg dumpers

2022-01-14 Thread Petr Mladek
On Thu 2022-01-13 12:15:08, Guilherme G. Piccoli wrote:
> On 13/01/2022 11:22, Petr Mladek wrote:
> > [...]
> > OK, do we have any specific reason why panic_print_sys_info()
> > should get called right before kmsg_dump() when this code patch
> > is used?
> > 
> > Alternative solution would be to remove the check of
> > kexec_crash_loaded() and always call panic_print_sys_info(false)
> > at the beginning (after kgdb_panic(buf)).
> > 
> > The advantage is that panic_print_sys_info(false) will be always
> > called on the same location. It will give the same results
> > in all code paths so that it will be easier to interpret them.
> > And it will have the same problems so it should be easier
> > to debug and maintain.
> > 
> > It is possible that it will not work for some users. Also it is
> > possible that it might cause some problems. But it is hard to
> > guess at least for me.
> > 
> > I think that we might try it and see if anyone complains.
> > Honestly, I think that only few people use panic_printk_sys_info().
> > And your use-case makes sense.
> > 
> > Best Regards,
> > Petr
> 
> Hi Petr, thanks for your idea - it's simple and effective, would resolve
> the problems in a straightforward way. But there are some cons, let me
> detail more.
> 
> Currently (in linux-next), if users set panic_print but not kdump, the
> panic_print_sys_info() is called *after* the panic notifiers. If user
> has kdump configured and still sets panic_print (which is our use case),
> then panic_print_sys_info() is called _before_ the panic notifiers. In
> other words, the behavior for non-kdump users is the same as before,
> since the merge of panic_print functionality.
> 
> Your idea brings the printing earlier, always before the panic
> notifiers. This isn't terrible, but might lead to unfortunate and
> hard-to-debug problems; for example, some panic notifiers are
> rcu_panic() and hung_task_panic(), both are simple functions to disable
> RCU warnings and hung task detector in panic scenarios. If we go with
> your idea, these won't get called before panic_print_sys_info(), even if
> kdump is not set. So, since the panic_print triggers a lot of printing
> in the console, we could face a stall and trigger RCU messages, maybe
> intermixed with the panic_print information.

I see. OK, it makes sense to call it after the panic notifiers when
they are used. It would be nice to mention the above in the commit
message and explain why the 2nd call is there.

Just an idea. It might be better to move the 1st call below
if (!_crash_kexec_post_notifiers). It would make it more
clear that it is intended for this code path. I mean:

if (!_crash_kexec_post_notifiers) {
/* ... */
if (kexec_crash_loaded())
panic_print_sys_info(false);

__crash_kexec(NULL);
...

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V2] panic: Move panic_print before kmsg dumpers

2022-01-13 Thread Petr Mladek
On Thu 2022-01-13 09:34:01, Guilherme G. Piccoli wrote:
> On 13/01/2022 08:50, Petr Mladek wrote:
> >> @@ -249,7 +252,7 @@ void panic(const char *fmt, ...)
> >> * show some extra information on kernel log if it was set...
> >> */
> >>if (kexec_crash_loaded())
> >> -  panic_print_sys_info();
> >> +  panic_print_sys_info(false);
> > 
> > panic_print_sys_info(false) will be called twice when both
> > kexec_crash_loaded() and _crash_kexec_post_notifiers are true.
> > 
> > Do we really need to call panic_print_sys_info() here? All information
> > provided by panic_print_sys_info(false) can be found also in
> > the crash dump.
> > 
> >>/*
> >> * If we have crashed and we have a crash kernel loaded let it handle
> >> @@ -283,6 +286,8 @@ void panic(const char *fmt, ...)
> >> */
> >>atomic_notifier_call_chain(_notifier_list, 0, buf);
> >>  
> >> +  panic_print_sys_info(false);
> > 
> > This is where the info might be printed 2nd time.
> > 
> >> +
> >>kmsg_dump(KMSG_DUMP_PANIC);

> Thanks for catching this issue - indeed, if
> "_crash_kexec_post_notifiers" is true, with this patch we print stuff
> twice. I will submit a V3 that guards against that, using a bool, makes
> sense to you?

It might be possible to check kexec_crash_loaded() on the two
locations. But I think about even easier solution, see below.


> The interesting question here is:
> > Do we really need to call panic_print_sys_info() here? All information
> > provided by panic_print_sys_info(false) can be found also in
> > the crash dump.
> 
> So, we indeed need that in our use case. Crash is meant to be used
> post-mortem, i.e., you made a full vmcore collection and then, of
> course, you have basically all the data you need accessible though the
> crash tool.
> 
> Problem is: in our use case, we want more data than a regular dmesg in a
> panic event (hence we use panic_print), but we don't collect a full
> crash dump, due to its big size. Also, as you can imagine, we do favor
> pstore over kdump, but it might fail due to a variety of reasons (like
> not having a free RAM buffer for ramoops), so kdump is our fallback.
> Hence, we'd like to be able to use panic_print with both kdump and
> pstore, and for that, both patches are needed.

Fair enough.

OK, do we have any specific reason why panic_print_sys_info()
should get called right before kmsg_dump() when this code patch
is used?

Alternative solution would be to remove the check of
kexec_crash_loaded() and always call panic_print_sys_info(false)
at the beginning (after kgdb_panic(buf)).

The advantage is that panic_print_sys_info(false) will be always
called on the same location. It will give the same results
in all code paths so that it will be easier to interpret them.
And it will have the same problems so it should be easier
to debug and maintain.

It is possible that it will not work for some users. Also it is
possible that it might cause some problems. But it is hard to
guess at least for me.

I think that we might try it and see if anyone complains.
Honestly, I think that only few people use panic_printk_sys_info().
And your use-case makes sense.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH V2] panic: Move panic_print before kmsg dumpers

2022-01-13 Thread Petr Mladek
On Thu 2022-01-06 18:28:35, Guilherme G. Piccoli wrote:
> The panic_print setting allows users to collect more information in a
> panic event, like memory stats, tasks, CPUs backtraces, etc.
> This is a pretty interesting debug mechanism, but currently the print
> event happens *after* kmsg_dump(), meaning that pstore, for example,
> cannot collect a dmesg with the panic_print information.

> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -249,7 +252,7 @@ void panic(const char *fmt, ...)
>* show some extra information on kernel log if it was set...
>*/
>   if (kexec_crash_loaded())
> - panic_print_sys_info();
> + panic_print_sys_info(false);

panic_print_sys_info(false) will be called twice when both
kexec_crash_loaded() and _crash_kexec_post_notifiers are true.

Do we really need to call panic_print_sys_info() here? All information
provided by panic_print_sys_info(false) can be found also in
the crash dump.

>   /*
>* If we have crashed and we have a crash kernel loaded let it handle
> @@ -283,6 +286,8 @@ void panic(const char *fmt, ...)
>*/
>   atomic_notifier_call_chain(_notifier_list, 0, buf);
>  
> + panic_print_sys_info(false);

This is where the info might be printed 2nd time.

> +
>   kmsg_dump(KMSG_DUMP_PANIC);
>  
>   /*

Otherwise, the change makes sense to me.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] panic: Move panic_print before kmsg dumpers

2022-01-04 Thread Petr Mladek
On Mon 2022-01-03 13:30:35, Guilherme G. Piccoli wrote:
> Hi Petr, first of all, I'd like to thank you for the _very_ complete and
> informative response, pretty useful!
> Some comments inline below:
> 
> 
> On 03/01/2022 11:58, Petr Mladek wrote:
> > [...]
> > I see the point. Unfortunately, there is some risk in this change
> > so I have to ask:
> > 
> > Did you miss this behavior in practice?
> > Or does it just look like something that might be useful?
> 
> Heh it's a valid question! And the answer is that indeed, we miss this
> behavior in practice - the plan is to use pstore as panic "fast" log
> collection (instead of kdump), and to print some extra info through
> panic_print. Hence the surprise when that information wasn't properly
> stored in the pstore saved logs...

Good to know.

> > [...] 
> > Note that panic_print_sys_info() might produce a lot of lines. There is
> > non-trivial chance that a lot of information might get lost because
> > of the limited log buffer size.
> > 
> > It might cause regression even when no dumpers are registered and
> > crash kernel is not loaded. panic_print_sys_info() might overwrite
> > the existing messages before we try hard to flush them via
> > console_flush_on_panic().
> 
> Sure, we already account for this risk, and make use of "log_buf_len" to
> increase dmesg size, something like 4M/8M.
> 
> Now, the second portion of your statement is a bit frightening! Maybe a
> silly question, but the risk is to lose some messages in the *dmesg* or
> only in the output console, like a serial/graphic console?
> 
> If the risk is to lose messages even in the dmesg (that is going to be
> collected through pstore/any other kmsg dumper),

To make it clear. panic_printk_sys_info() might overwrite the oldest
messages stored on the log buffer. This patch moves
panic_print_sys_info() before kmsg_dump(). It means that even
kmsg_dump() would not see the overwritten oldest messages.


> do you think would make sense to have a second (in fact, first!)
> "console_flush_on_panic(...)" _before_ "panic_print_sys_info()"?
> What would be the cons in this approach? I understand that we may need
> to (at least) disable debug_locks earlier than it's currently done, for
> example. If kexec is more risky having this earlier flush, we could
> flush conditionally on not having a crash kernel loaded.

Good question. I am familiar only with the console problems and there
are many problems. Serial consoles might be slow. All consoles use
internal locks that might cause deadlocks. More complicated consoles
are even more risky.

My experience is that kexec mostly works when there is enough reserved
space for booting the crash kernel. And it can be tested before
the crash.

So, I personally think that console_flush_on_panic() is more risky
and should not get called before kexec().

Also kexec might be simply disabled when it does not work. But
console_flush_on_panic() could not be disabled if we call it
too early. We might make it configurable. But it would be
to complicated for users to get it right.


> > After all, the proposed patch might be good enough.
> > panic_print_sys_info() does nothing by default.
> > It might be enough to document that a large enough log buffer
> > should be used when some output is enabled, especially when
> > a dumper is used.
> > 
> > Also we should mention these pitfalls and risks in the commit
> > message.
> 
> OK, makes total sense. I'll wait on your feedback to my points above,
> and maybe others could also provide ideas/concerns, and then, I'll
> rework the commit message/code accordingly.

I think that this is the best solution after all.

Well, I see one more problem. CONSOLE_REPLAY_ALL replays all messages
on the console. This flag should be handled when it is now. I mean
after kmsg_dump(), kexec(), console_flush_on_panic().

It is pity that PANIC_PRINT_ALL_PRINTK_MSG is in "panic_print" flags.
It makes sense only for consoles. All the other flags make sense
also for kmsg_dump().

A solution would be to add a new kernel parameter that would
obsolete PANIC_PRINT_ALL_PRINTK_MSG. The parameter can be
called panic_console_replay or so.


> Thanks again, and a happy new year to you and family =)

Same to you.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] panic: Move panic_print before kmsg dumpers

2022-01-03 Thread Petr Mladek
On Thu 2021-12-30 13:18:28, Guilherme G. Piccoli wrote:
> The panic_print setting allows users to collect more information in a
> panic event, like memory stats, tasks, CPUs backtraces, etc.
> This is a pretty interesting debug mechanism, but currently the print
> event happens *after* kmsg_dump(), meaning that Pstore, for example,
> cannot collect a dmesg with the panic_print information.

I see the point. Unfortunately, there is some risk in this change
so I have to ask:

Did you miss this behavior in practice?
Or does it just look like something that might be useful?


> This patch changes that by moving the panic_print_sys_info() function
> call up some lines, in order kmsg_dump() accounts this new information
> and exposes it through Pstore or other kmsg dumpers.

Note that panic_print_sys_info() might produce a lot of lines. There is
non-trivial chance that a lot of information might get lost because
of the limited log buffer size.

It might cause regression even when no dumpers are registered and
crash kernel is not loaded. panic_print_sys_info() might overwrite
the existing messages before we try hard to flush them via
console_flush_on_panic().


> Cc: Feng Tang 
> Signed-off-by: Guilherme G. Piccoli 
> ---
> 
> Hey folks, thanks in advance for reviews/comments!
> One alternative I've considered was to move kmsg_dump() some
> lines down, I'm not sure which approach is better, can't see
> much difference. Opinions are very welcome =)

This does not look like a good idea either. The comment below
kmsg_dump(KMSG_DUMP_PANIC) says that it should be called
before crash_kexec() that might fail.


IMHO, the best solution would be to allow dumping messages produced
by panic_print_sys_info() using the registered dumpers directly.
But it might require redesigning the kmsg_dump API.


After all, the proposed patch might be good enough.
panic_print_sys_info() does nothing by default.
It might be enough to document that a large enough log buffer
should be used when some output is enabled, especially when
a dumper is used.

Also we should mention these pitfalls and risks in the commit
message.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v4 0/6] printk: remove safe buffers

2021-07-27 Thread Petr Mladek
On Thu 2021-07-15 21:39:53, John Ogness wrote:
> Hi,
> 
> Here is v4 of a series to remove the safe buffers. v3 can be
> found here [0]. The safe buffers are no longer needed because
> messages can be stored directly into the log buffer from any
> context.
> 
> However, the safe buffers also provided a form of recursion
> protection. For that reason, explicit recursion protection is
> implemented for this series.
> 
> The safe buffers also implicitly provided serialization
> between multiple CPUs executing in NMI context. This was
> particularly necessary for the nmi_backtrace() output. This
> serializiation is now preserved by using the printk cpulock.
> 
> With the removal of the safe buffers, there is no need for
> extra NMI enter/exit tracking. So this is also removed
> (which includes removing the config option CONFIG_PRINTK_NMI).
> 
> And finally, there are a few places in the kernel that need to
> specify code blocks where all printk calls are to be deferred
> printing. Previously the NMI tracking API was being (mis)used
> for this purpose. This series introduces an official and
> explicit interface for such cases. (Note that all deferred
> printing will be removed anyway, once printing kthreads are
> introduced.)
> 
> John Ogness (6):
>   lib/nmi_backtrace: explicitly serialize banner and regs
>   printk: track/limit recursion
>   printk: remove safe buffers
>   printk: remove NMI tracking
>   printk: convert @syslog_lock to mutex
>   printk: syslog: close window between wait and read

The entire patchset has been committed into printk/linux.git,
branch rework/printk_safe-removal.

Note that I have updated the 4th patch as discussed, see
https://lore.kernel.org/r/20210721120026.y3dqno24ahw4s...@pathway.suse.cz
https://lore.kernel.org/r/20210721130852.zrjnti6b3fwjg...@pathway.suse.cz

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v4 3/6] printk: remove safe buffers

2021-07-21 Thread Petr Mladek
On Thu 2021-07-15 21:39:56, John Ogness wrote:
> With @logbuf_lock removed, the high level printk functions for
> storing messages are lockless. Messages can be stored from any
> context, so there is no need for the NMI and safe buffers anymore.
> Remove the NMI and safe buffers.
> 
> Although the safe buffers are removed, the NMI and safe context
> tracking is still in place. In these contexts, store the message
> immediately but still use irq_work to defer the console printing.
> 
> Since printk recursion tracking is in place, safe context tracking
> for most of printk is not needed. Remove it. Only safe context
> tracking relating to the console and console_owner locks is left
> in place. This is because the console and console_owner locks are
> needed for the actual printing.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v3 3/6] printk: remove safe buffers

2021-06-25 Thread Petr Mladek
On Thu 2021-06-24 17:41:56, John Ogness wrote:
> I would prefer a v4 with these fixes:
> 
> - wrap @console_owner_lock with printk_safe usage
> 
> - remove unnecessary printk_safe usage from printk_safe.c
> 
> - update commit message to say that safe context tracking is left in
>   place for both the console and console_owner locks

Sounds good to me.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v3 3/6] printk: remove safe buffers

2021-06-24 Thread Petr Mladek
On Thu 2021-06-24 13:17:45, John Ogness wrote:
> With @logbuf_lock removed, the high level printk functions for
> storing messages are lockless. Messages can be stored from any
> context, so there is no need for the NMI and safe buffers anymore.
> Remove the NMI and safe buffers.
> 
> Although the safe buffers are removed, the NMI and safe context
> tracking is still in place. In these contexts, store the message
> immediately but still use irq_work to defer the console printing.
> 
> Since printk recursion tracking is in place, safe context tracking
> for most of printk is not needed. Remove it. Only safe context
> tracking relating to the console lock is left in place. This is
> because the console lock is needed for the actual printing.

Feel free to use:

Reviewed-by: Petr Mladek 

There are some comments below.

> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1852,7 +1839,7 @@ static int console_trylock_spinning(void)
>   if (console_trylock())
>   return 1;
>  
> - printk_safe_enter_irqsave(flags);
> + local_irq_save(flags);
>  
>   raw_spin_lock(_owner_lock);

This spin_lock is in the printk() path. We must make sure that
it does not cause deadlock.

printk_safe_enter_irqsave(flags) prevented the recursion because
it deferred the console handling.

One danger might be a lockdep report triggered by
raw_spin_lock(_owner_lock) itself. But it should be safe.
lockdep is checked before the lock is actually taken
and lockdep should disable itself before printing anything.

Another danger might be any printk() called under the lock.
The code just compares and assigns values to some variables
(static, on stack) so we should be on the safe side.

Well, I would feel more comfortable if we add printk_safe_enter_irqsave()
back around the sections guarded by this lock. It can be done
in a separate patch. The code looks safe at the moment.


> @@ -2664,9 +2648,9 @@ void console_unlock(void)
>  
>   for (;;) {
>   size_t ext_len = 0;
> + int handover;
>   size_t len;
>  
> - printk_safe_enter_irqsave(flags);
>  skip:
>   if (!prb_read_valid(prb, console_seq, ))
>   break;
> @@ -2716,19 +2700,22 @@ void console_unlock(void)
>* were to occur on another CPU, it may wait for this one to
>* finish. This task can not be preempted if there is a
>* waiter waiting to take over.
> +  *
> +  * Interrupts are disabled because the hand over to a waiter
> +  * must not be interrupted until the hand over is completed
> +  * (@console_waiter is cleared).
>*/
> + local_irq_save(flags);
>   console_lock_spinning_enable();

Same here. console_lock_spinning_enable() takes console_owner_lock.
I would feel more comfortable if we added printk_safe_enter_irqsave(flags)
inside console_lock_spinning_enable() around the locked code. The code
is safe at the moment but...

>  
>   stop_critical_timings();/* don't trace print latency */
>   call_console_drivers(ext_text, ext_len, text, len);
>   start_critical_timings();
>  
> - if (console_lock_spinning_disable_and_check()) {
> - printk_safe_exit_irqrestore(flags);
> + handover = console_lock_spinning_disable_and_check();

Same here. Also console_lock_spinning_disable_and_check() takes
console_owner_lock. It looks safe at the moment but...


> + local_irq_restore(flags);
> + if (handover)
>   return;
> - }
> -
> - printk_safe_exit_irqrestore(flags);
>  
>   if (do_cond_resched)
>   cond_resched();

> --- a/kernel/printk/printk_safe.c
> +++ b/kernel/printk/printk_safe.c
> @@ -369,7 +70,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
>* Use the main logbuf even in NMI. But avoid calling console
>* drivers that might have their own locks.
>*/
> - if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
> + if (this_cpu_read(printk_context) &
> + (PRINTK_NMI_DIRECT_CONTEXT_MASK |
> +  PRINTK_NMI_CONTEXT_MASK |
> +  PRINTK_SAFE_CONTEXT_MASK)) {
>   unsigned long flags;
>   int len;
>  

There is the following code right below:

printk_safe_enter_irqsave(flags);
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
printk_safe_exit_irqrestore(flags);
defer_console_output();
return len;

printk_safe_enter_irqsave(flags) 

Re: [PATCH v6 00/13] Add build ID to stacktraces

2021-05-11 Thread Petr Mladek
On Tue 2021-05-11 12:58:47, David Laight wrote:
> From: Steven Rostedt
> > Sent: 11 May 2021 13:53
> > 
> > On Tue, 11 May 2021 12:36:06 +
> > David Laight  wrote:
> > 
> > > >  x1 : ff93fef15788 x0 : ffe3622352e0
> > > >  Call trace:
> > > >   lkdtm_WARNING+0x28/0x30 [lkdtm 
> > > > ed5019fdf5e53be37cb1ba7899292d7e143b259e]
> > > >   direct_entry+0x16c/0x1b4 [lkdtm 
> > > > ed5019fdf5e53be37cb1ba7899292d7e143b259e]
> > > >   full_proxy_write+0x74/0xa4
> > >
> > > Is there any way to get it to print each module ID only once?
> > 
> > If there's a trivial way to do that, then perhaps it should be done, but for
> > now, this patch series isn't as obnoxious as the previous versions. It only
> > affects stack traces, and I'm fine with that.
> 
> True. Printing the id in the module list was horrid.
> 
> The real downside is all the extra text that will overflow the
> in-kernel buffer.
> At least it shouldn't be extra lines causing screen wrap.
> Unless the variable names are long - hi rust :-)

Note that the ID is printed only when CONFIG_STACKTRACE_BUILD_ID
is enabled. It will be used only by some distros/vendors that
use it to download the debuginfo packages.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v6 00/13] Add build ID to stacktraces

2021-05-11 Thread Petr Mladek
On Mon 2021-05-10 17:38:32, Stephen Boyd wrote:
> This series adds the kernel's build ID[1] to the stacktrace header
> printed in oops messages, warnings, etc. and the build ID for any module
> that appears in the stacktrace after the module name. The goal is to
> make the stacktrace more self-contained and descriptive by including the
> relevant build IDs in the kernel logs when something goes wrong. This
> can be used by post processing tools like script/decode_stacktrace.sh
> and kernel developers to easily locate the debug info associated with a
> kernel crash and line up what line and file things started falling apart
> at.

The entire series looks good to me.

I reviewed carefully only the 5th patch touching printk/kallsyms/module
code. I just scanned over the other patches touching kernel code
and did not notice any obvious problem. I did not check the changes
in decode_stacktrace.sh at all.

I tried to get stacktraces on x86_64 and it worked as expected.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v3 12/12] kdump: Use vmlinux_build_id to simplify

2021-04-07 Thread Petr Mladek
On Tue 2021-03-30 20:05:20, Stephen Boyd wrote:
> We can use the vmlinux_build_id array here now instead of open coding
> it. This mostly consolidates code.
> 
> Cc: Jiri Olsa 
> Cc: Alexei Starovoitov 
> Cc: Jessica Yu 
> Cc: Evan Green 
> Cc: Hsin-Yi Wang 
> Cc: Dave Young 
> Cc: Baoquan He 
> Cc: Vivek Goyal 
> Cc: 
> Signed-off-by: Stephen Boyd 
> ---
>  include/linux/crash_core.h |  6 +-
>  kernel/crash_core.c| 41 ++
>  2 files changed, 3 insertions(+), 44 deletions(-)
> 
> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
> index 206bde8308b2..fb8ab99bb2ee 100644
> --- a/include/linux/crash_core.h
> +++ b/include/linux/crash_core.h
> @@ -39,7 +39,7 @@ phys_addr_t paddr_vmcoreinfo_note(void);
>  #define VMCOREINFO_OSRELEASE(value) \
>   vmcoreinfo_append_str("OSRELEASE=%s\n", value)
>  #define VMCOREINFO_BUILD_ID(value) \
> - vmcoreinfo_append_str("BUILD-ID=%s\n", value)
> + vmcoreinfo_append_str("BUILD-ID=%20phN\n", value)

Please, add also build check that BUILD_ID_MAX == 20.


>  #define VMCOREINFO_PAGESIZE(value) \
>   vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
>  #define VMCOREINFO_SYMBOL(name) \
> @@ -69,10 +69,6 @@ extern unsigned char *vmcoreinfo_data;
>  extern size_t vmcoreinfo_size;
>  extern u32 *vmcoreinfo_note;
>  
> -/* raw contents of kernel .notes section */
> -extern const void __start_notes __weak;
> -extern const void __stop_notes __weak;
> -
>  Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
> void *data, size_t data_len);
>  void final_note(Elf_Word *buf);
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 825284baaf46..6b560cf9f374 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -4,6 +4,7 @@
>   * Copyright (C) 2002-2004 Eric Biederman  
>   */
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -378,51 +379,13 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
>  }
>  EXPORT_SYMBOL(paddr_vmcoreinfo_note);
>  
> -#define NOTES_SIZE (&__stop_notes - &__start_notes)
> -#define BUILD_ID_MAX SHA1_DIGEST_SIZE
> -#define NT_GNU_BUILD_ID 3
> -
> -struct elf_note_section {
> - struct elf_note n_hdr;
> - u8 n_data[];
> -};
> -
>  /*
>   * Add build ID from .notes section as generated by the GNU ld(1)
>   * or LLVM lld(1) --build-id option.
>   */
>  static void add_build_id_vmcoreinfo(void)
>  {
> - char build_id[BUILD_ID_MAX * 2 + 1];
> - int n_remain = NOTES_SIZE;
> -
> - while (n_remain >= sizeof(struct elf_note)) {
> - const struct elf_note_section *note_sec =
> - &__start_notes + NOTES_SIZE - n_remain;
> - const u32 n_namesz = note_sec->n_hdr.n_namesz;
> -
> - if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
> - n_namesz != 0 &&
> - !strcmp((char *)_sec->n_data[0], "GNU")) {
> - if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
> - const u32 n_descsz = note_sec->n_hdr.n_descsz;
> - const u8 *s = _sec->n_data[n_namesz];
> -
> - s = PTR_ALIGN(s, 4);
> - bin2hex(build_id, s, n_descsz);
> - build_id[2 * n_descsz] = '\0';
> - VMCOREINFO_BUILD_ID(build_id);
> - return;
> - }
> - pr_warn("Build ID is too large to include in 
> vmcoreinfo: %u > %u\n",
> - note_sec->n_hdr.n_descsz,
> - BUILD_ID_MAX);
> - return;
> - }
> - n_remain -= sizeof(struct elf_note) +
> - ALIGN(note_sec->n_hdr.n_namesz, 4) +
> - ALIGN(note_sec->n_hdr.n_descsz, 4);
> - }
> + VMCOREINFO_BUILD_ID(vmlinux_build_id);
>  }

The function add_build_id_vmcoreinfo() is used in
crash_save_vmcoreinfo_init() in this context:


VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
add_build_id_vmcoreinfo();
VMCOREINFO_PAGESIZE(PAGE_SIZE);

VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_OFFSET(uts_namespace, name);
VMCOREINFO_SYMBOL(node_online_map);

The function is not longer need. VMCOREINFO_BUILD_ID()
can be used directly:

VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
VMCOREINFO_BUILD_ID(vmlinux_build_id);
VMCOREINFO_PAGESIZE(PAGE_SIZE);

VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_OFFSET(uts_namespace, name);
VMCOREINFO_SYMBOL(node_online_map);


Best Regards,
Petr


>  
>  static int __init crash_save_vmcoreinfo_init(void)
> -- 
> https://chromeos.dev

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 2/5] printk: remove safe buffers

2021-04-06 Thread Petr Mladek
On Fri 2021-04-02 11:14:18, Sergey Senozhatsky wrote:
> On (21/04/01 16:17), Petr Mladek wrote:
> > > For the long term, we should introduce a printk-context API that allows
> > > callers to perfectly pack their multi-line output into a single
> > > entry. We discussed [0][1] this back in August 2020.
> > 
> > We need a "short" term solution. There are currently 3 solutions:
> > 
> > 1. Keep nmi_safe() and all the hacks around.
> > 
> > 2. Serialize nmi_cpu_backtrace() by a spin lock and later by
> >the special lock used also by atomic consoles.
> > 
> > 3. Tell complaining people how to sort the messed logs.
> 
> Are we talking about nmi_cpu_backtrace()->dump_stack() or some
> other path?

It is about serializing

if (regs)
show_regs(regs);
else
dump_stack();

in nmi_cpu_backtrace() when it is triggered on many(all) CPUs
at the same time.


> dump_stack() seems to be already serialized by `dump_lock`. Hmm,
> show_regs() is not serialized, seems like it should be under the
> same `dump_lock` as dump_stack().

Ah, I think that you already mentioned it in the past and I forget it.

Yes, we would need to synchronize all these dump/show functions using
the same lock. It is already the lock that might be taken recursively
on the same CPU.

In each case, we must not introduce another lock in
nmi_cpu_backtrace() because it might cause deadlock with
@dump_lock.

Anyway, I would really like to keep the dumps serialized. So, we
either need to use the same lock everywhere or we need to keep
nmi_safe buffers for now.

I would like to remove the nmi_safe buffers in the long term
but I am fine with doing it later after the consoles rework.
I'll leave the prioritization for John who is doing the work
and might have some preferences.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 2/5] printk: remove safe buffers

2021-04-01 Thread Petr Mladek
On Thu 2021-04-01 15:19:52, John Ogness wrote:
> On 2021-04-01, Petr Mladek  wrote:
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> >> @@ -1142,24 +1128,37 @@ void __init setup_log_buf(int early)
> >> new_descs, ilog2(new_descs_count),
> >> new_infos);
> >>  
> >> -  printk_safe_enter_irqsave(flags);
> >> +  local_irq_save(flags);
> >
> > IMHO, we actually do not have to disable IRQ here. We already copy
> > messages that might appear in the small race window in NMI. It would
> > work the same way also for IRQ context.
> 
> We do not have to, but why open up this window? We are still in early
> boot and interrupts have always been disabled here. I am not happy that
> this window even exists. I really prefer to keep it NMI-only.

Fair enough.

> >> --- a/lib/nmi_backtrace.c
> >> +++ b/lib/nmi_backtrace.c
> >> @@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t 
> >> *mask,
> >>touch_softlockup_watchdog();
> >>}
> >>  
> >> -  /*
> >> -   * Force flush any remote buffers that might be stuck in IRQ context
> >> -   * and therefore could not run their irq_work.
> >> -   */
> >> -  printk_safe_flush();
> >
> > Sigh, this reminds me that the nmi_safe buffers serialized backtraces
> > from all CPUs.
> >
> > I am afraid that we have to put back the spinlock into
> > nmi_cpu_backtrace().
> 
> Please no. That spinlock is a disaster. It can cause deadlocks with
> other cpu-locks (such as in kdb)

Could you please explain more the kdb case?
I am curious what locks might depend on each other here.

> and it will cause a major problem for atomic consoles.

AFAIK, you are going to add a special lock that would allow
nesting on the same CPU. It should possible and safe
to use is also for synchronizing the backtraces here.


> We need to be very careful about introducing locks
> where NMIs are waiting on other CPUs.

I agree.


> > It has been repeatedly added and removed depending
> > on whether the backtrace was printed into the main log buffer
> > or into the per-CPU buffers. Last time it was removed by
> > the commit 03fc7f9c99c1e7ae2925d ("printk/nmi: Prevent deadlock
> > when accessing the main log buffer in NMI").
> >
> > It should be safe because there should not be any other locks in the
> > code path. Note that only one backtrace might be triggered at the same
> > time, see @backtrace_flag in nmi_trigger_cpumask_backtrace().
> 
> It is adding a lock around a lockless ringbuffer. For me that is a step
> backwards.
> 
> > We _must_ serialize it somehow[*]. The lock in nmi_cpu_backtrace()
> > looks less evil than the nmi_safe machinery. nmi_safe() shrinks
> > too long backtraces, lose timestamps, needs to be explicitely
> > flushed here and there, is a non-trivial code.
> >
> > [*] Non-serialized bactraces are real mess. Caller-id is visible
> > only on consoles or via syslogd interface. And it is not much
> > convenient.
> 
> Caller-id solves this problem and is easy to sort for anyone with
> `grep'. Yes, it is a shame that `dmesg' does not show it, but directly
> using any of the printk interfaces does show it (kmsg_dump, /dev/kmsg,
> syslog, console).

True but frankly, the current situation is _far_ from convenient:

   + consoles do not show it by default
   + none userspace tool (dmesg, journalctl, crash) is able to show it
   + grep is a nightmare, especially if you have more than handful of CPUs

Yes, everything is solvable but not easily.

> > I get this with "echo l >/proc/sysrq-trigger" and this patchset:
> 
> Of course. Without caller-id, it is a mess. But this has nothing to do
> with NMI. The same problem exists for WARN_ON() on multiple CPUs
> simultaneously. If the user is not using caller-id, they are
> lost. Caller-id is the current solution to the interlaced logs.

Sure. But in reality, the risk of mixed WARN_ONs is small. While
this patch makes backtraces from all CPUs always unusable without
caller_id and non-trivial effort.


> For the long term, we should introduce a printk-context API that allows
> callers to perfectly pack their multi-line output into a single
> entry. We discussed [0][1] this back in August 2020.

We need a "short" term solution. There are currently 3 solutions:

1. Keep nmi_safe() and all the hacks around.

2. Serialize nmi_cpu_backtrace() by a spin lock and later by
   the special lock used also by atomic consoles.

3. Tell complaining people how to sort the messed logs.


My preference:

I most prefer 2nd solution until I see a realistic scenario
of a possible deadlock with the current kernel code.

I would still prefer 1st solution over 3rd one until we improve
kernel/userspace support for sorting the log by the caller id.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 2/5] printk: remove safe buffers

2021-04-01 Thread Petr Mladek
On Tue 2021-03-30 17:35:09, John Ogness wrote:
> With @logbuf_lock removed, the high level printk functions for
> storing messages are lockless. Messages can be stored from any
> context, so there is no need for the NMI and safe buffers anymore.
> Remove the NMI and safe buffers.
> 
> Although the safe buffers are removed, the NMI and safe context
> tracking is still in place. In these contexts, store the message
> immediately but still use irq_work to defer the console printing.
> 
> Since printk recursion tracking is in place, safe context tracking
> for most of printk is not needed. Remove it. Only safe context
> tracking relating to the console lock is left in place. This is
> because the console lock is needed for the actual printing.

> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1142,24 +1128,37 @@ void __init setup_log_buf(int early)
>new_descs, ilog2(new_descs_count),
>new_infos);
>  
> - printk_safe_enter_irqsave(flags);
> + local_irq_save(flags);

IMHO, we actually do not have to disable IRQ here. We already copy
messages that might appear in the small race window in NMI. It would work
the same way also for IRQ context.

>   log_buf_len = new_log_buf_len;
>   log_buf = new_log_buf;
>   new_log_buf_len = 0;
>  
>   free = __LOG_BUF_LEN;
> - prb_for_each_record(0, _rb_static, seq, )
> - free -= add_to_rb(_rb_dynamic, );
> + prb_for_each_record(0, _rb_static, seq, ) {
> + text_size = add_to_rb(_rb_dynamic, );
> + if (text_size > free)
> + free = 0;
> + else
> + free -= text_size;
> + }
>  
> - /*
> -  * This is early enough that everything is still running on the
> -  * boot CPU and interrupts are disabled. So no new messages will
> -  * appear during the transition to the dynamic buffer.
> -  */
>   prb = _rb_dynamic;
>  
> - printk_safe_exit_irqrestore(flags);
> + local_irq_restore(flags);
> +
> + /*
> +  * Copy any remaining messages that might have appeared from
> +  * NMI context after copying but before switching to the
> +  * dynamic buffer.

The above comment would need to get updated if we keep also normal
IRQ enabled when copying the log buffers.

> +  */
> + prb_for_each_record(seq, _rb_static, seq, ) {
> + text_size = add_to_rb(_rb_dynamic, );
> + if (text_size > free)
> + free = 0;
> + else
> + free -= text_size;
> + }
>  
>   if (seq != prb_next_seq(_rb_static)) {
>   pr_err("dropped %llu messages\n",

> --- a/lib/nmi_backtrace.c
> +++ b/lib/nmi_backtrace.c
> @@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
>   touch_softlockup_watchdog();
>   }
>  
> - /*
> -  * Force flush any remote buffers that might be stuck in IRQ context
> -  * and therefore could not run their irq_work.
> -  */
> - printk_safe_flush();

Sigh, this reminds me that the nmi_safe buffers serialized backtraces
from all CPUs.

I am afraid that we have to put back the spinlock into
nmi_cpu_backtrace(). It has been repeatedly added and removed depending
on whether the backtrace was printed into the main log buffer
or into the per-CPU buffers. Last time it was removed by
the commit 03fc7f9c99c1e7ae2925d ("printk/nmi: Prevent deadlock
when accessing the main log buffer in NMI").

It should be safe because there should not be any other locks in the
code path. Note that only one backtrace might be triggered at the same
time, see @backtrace_flag in nmi_trigger_cpumask_backtrace().

We _must_ serialize it somehow[*]. The lock in nmi_cpu_backtrace()
looks less evil than the nmi_safe machinery. nmi_safe() shrinks
too long backtraces, lose timestamps, needs to be explicitely
flushed here and there, is a non-trivial code.

[*] Non-serialized bactraces are real mess. Caller-id is visible
only on consoles or via syslogd interface. And it is not much
convenient.

I get this with "echo l >/proc/sysrq-trigger" and this patchset:

[   95.642793] sysrq: Show backtrace of all active CPUs
[   95.645202] NMI backtrace for cpu 3
[   95.646778] CPU: 3 PID: 5095 Comm: bash Kdump: loaded Tainted: GW
 5.11.0-default+ #231
[   95.650397] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[   95.656497] Call Trace:
[   95.657937]  dump_stack+0x88/0xab
[   95.659888]  nmi_cpu_backtrace+0xa4/0xc0
[   95.661744]  ? lapic_can_unplug_cpu+0xa0/0xa0
[   95.663658]  nmi_trigger_cpumask_backtrace+0xe6/0x120
[   95.665657]  arch_trigger_cpumask_backtrace+0x19/0x20
[   95.667720]  sysrq_handle_showallcpus+0x17/0x20
[   95.670218]  __handle_sysrq+0xe1/0x240
[   95.672190]  write_sysrq_trigger+0x51/0x60
[   95.673993]  proc_reg_write+0x62/0x90
[   95.675319]  vfs_write+0xed/0x380
[ 

Re: [PATCH v2 00/12] Add build ID to stacktraces

2021-03-30 Thread Petr Mladek
On Thu 2021-03-25 16:21:46, Stephen Boyd wrote:
> Quoting peter enderborg (2021-03-25 04:06:17)
> > On 3/24/21 9:55 AM, Christoph Hellwig wrote:
> > > On Tue, Mar 23, 2021 at 07:04:31PM -0700, Stephen Boyd wrote:
> > >>  x5 :  x4 : 0001
> > >>  x3 : 0008 x2 : ff93fef25a70
> > >>  x1 : ff93fef15788 x0 : ffe3622352e0
> > >>  Call trace:
> > >>   lkdtm_WARNING+0x28/0x30 [lkdtm 
> > >> ed5019fdf5e53be37cb1ba7899292d7e143b259e]
> > >>   direct_entry+0x16c/0x1b4 [lkdtm 
> > >> ed5019fdf5e53be37cb1ba7899292d7e143b259e]
> > > Yikes.  No, please do not make the backtraces a complete mess for
> > > something that serves absolutely no need.
> 
> It serves a need. Please look at the patches to understand that I'm
> adding the buildid to automatically find the associated debug
> information on distros.
> 
> > 
> > Would a "verbose" flag be acceptable solution?    Something like write 1 to 
> > /sys/kernel/debug/verbose_stack to get the extra info.
> > 
> > I think I see a need for it.
> > 
> 
> Or a kernel config option and a commandline parameter? That would be OK
> for me as I said on v1 of this series. I'll add that in for the next
> patch series given all the distaste for some more hex characters next to
> the module name.

IMHO, a build configure option would fit the best here.

It does not make sense to show the ID when the kernel vendor does
not have a service to download the related binaries. But it makes
sense to show the buildid by default when the provider/distro has
the service and want to use the ID when handling bug reports.

We could always add boot/run time options when people really need it.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-29 Thread Petr Mladek
On Fri 2021-03-26 12:12:37, John Ogness wrote:
> On 2021-03-23, Petr Mladek  wrote:
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> >> -
> >>if (seq != prb_next_seq(_rb_static)) {
> >>pr_err("dropped %llu messages\n",
> >>   prb_next_seq(_rb_static) - seq);
> >> @@ -2666,7 +2631,6 @@ void console_unlock(void)
> >>size_t ext_len = 0;
> >>size_t len;
> >>  
> >> -  printk_safe_enter_irqsave(flags);
> >>  skip:
> >>if (!prb_read_valid(prb, console_seq, ))
> >>break;
> >> @@ -2711,6 +2675,8 @@ void console_unlock(void)
> >>printk_time);
> >>console_seq++;
> >>  
> >> +  printk_safe_enter_irqsave(flags);
> >
> > What is the purpose of the printk_safe context here, please?
> 
> console_lock_spinning_enable() needs to be called with interrupts
> disabled. I should have just used local_irq_save().
> 
> I could add local_irq_save() to console_lock_spinning_enable() and
> restore them at the end of console_lock_spinning_disable_and_check(),
> but then I would need to add a @flags argument to both functions. I
> think it is simpler to just do the disable/enable from the caller,
> console_unlock().

I see. I have missed it that all this code have to be called with
interrupts disabled.

OK, it is a must-to-have because of the spinning. But I wonder if some
console drivers rely on the fact that the write() callback is
called with interrupts disabled.

IMHO, it would be a bug when any write() callback expects that
callers disabled the interrupts.

Do you plan to remove the console-spinning stuff after offloading
consoles to the kthreads?

Will you call console write() callback with irq enabled from
the kthread?

Anyway, we should at least add a comment why the interrupts are
disabled.


> BTW, I could not find any sane way of disabling interrupts via a
> raw_spin_lock_irqsave() of @console_owner_lock because of the how it is
> used with lockdep. In particular for
> console_lock_spinning_disable_and_check().

I see. IMHO, we would need to explicitly call local_irq_save()/restore()
if we moved them to console_lock_spinning_enable()/disable_and_check().
I mean to do:


static void console_lock_spinning_enable(unsigned long *flags)
{
local_irq_save(*flags);

raw_spin_lock(_owner_lock);
console_owner = current;
raw_spin_unlock(_owner_lock);

/* The waiter may spin on us after setting console_owner */
spin_acquire(_owner_dep_map, 0, 0, _THIS_IP_);
}

...

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-23 Thread Petr Mladek
On Wed 2021-03-17 00:33:25, John Ogness wrote:
> With @logbuf_lock removed, the high level printk functions for
> storing messages are lockless. Messages can be stored from any
> context, so there is no need for the NMI and safe buffers anymore.
> Remove the NMI and safe buffers.
> 
> Although the safe buffers are removed, the NMI and safe context
> tracking is still in place. In these contexts, store the message
> immediately but still use irq_work to defer the console printing.
> 
> Since printk recursion tracking is in place, safe context tracking
> for most of printk is not needed. Remove it. Only safe context
> tracking relating to the console lock is left in place. This is
> because the console lock is needed for the actual printing.

I have two more questions after actually checking the entire patch.
See below.

> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1084,7 +1069,6 @@ void __init setup_log_buf(int early)
>   struct printk_record r;
>   size_t new_descs_size;
>   size_t new_infos_size;
> - unsigned long flags;
>   char *new_log_buf;
>   unsigned int free;
>   u64 seq;
> @@ -1142,8 +1126,6 @@ void __init setup_log_buf(int early)
>new_descs, ilog2(new_descs_count),
>new_infos);
>  
> - printk_safe_enter_irqsave(flags);
> -
>   log_buf_len = new_log_buf_len;
>   log_buf = new_log_buf;
>   new_log_buf_len = 0;
> @@ -1159,8 +1141,6 @@ void __init setup_log_buf(int early)
>*/
>   prb = _rb_dynamic;
>  
> - printk_safe_exit_irqrestore(flags);

This will allow to add new messages from the IRQ context when we
are copying them to the new buffer. They might get lost in
the small race window.

Also the messages from NMI might get lost because they are not
longer stored in the per-CPU buffer.

A possible solution might be to do something like this:

prb_for_each_record(0, _rb_static, seq, )
free -= add_to_rb(_rb_dynamic, );

prb = _rb_dynamic;

/*
 * Copy the remaining messages that might have appeared
 * from IRQ or NMI context after we ended copying and
 * before we switched the buffers. They must be finalized
 * because only one CPU is up at this stage.
 */
prb_for_each_record(seq, _rb_static, seq, )
free -= add_to_rb(_rb_dynamic, );


> -
>   if (seq != prb_next_seq(_rb_static)) {
>   pr_err("dropped %llu messages\n",
>  prb_next_seq(_rb_static) - seq);
> @@ -2666,7 +2631,6 @@ void console_unlock(void)
>   size_t ext_len = 0;
>   size_t len;
>  
> - printk_safe_enter_irqsave(flags);
>  skip:
>   if (!prb_read_valid(prb, console_seq, ))
>   break;
> @@ -2711,6 +2675,8 @@ void console_unlock(void)
>   printk_time);
>   console_seq++;
>  
> + printk_safe_enter_irqsave(flags);

What is the purpose of the printk_safe context here, please?

I guess that you wanted to prevent calling console drivers
recursively. But it is already serialized by console_lock().

IMHO, the only risk is when manipulating console_sem->lock
or console_owner_lock. But they are already guarded by
printk_safe context, for example, in console_lock() or
console_lock_spinning_enable().

Do I miss something, please?


> +
>   /*
>* While actively printing out messages, if another printk()
>* were to occur on another CPU, it may wait for this one to
> @@ -2745,8 +2711,6 @@ void console_unlock(void)
>* flush, no worries.
>*/
>   retry = prb_read_valid(prb, console_seq, NULL);
> - printk_safe_exit_irqrestore(flags);
> -
>   if (retry && console_trylock())
>   goto again;
>  }

Heh, all these patches feels like stripping printk of an armour. I hope
that we trained it enough to be flexible and avoid any damage.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-23 Thread Petr Mladek
On Mon 2021-03-22 22:58:47, John Ogness wrote:
> On 2021-03-22, Petr Mladek  wrote:
> > On Mon 2021-03-22 12:16:15, John Ogness wrote:
> >> On 2021-03-21, Sergey Senozhatsky  wrote:
> >> >> @@ -369,7 +70,10 @@ __printf(1, 0) int vprintk_func(const char *fmt, 
> >> >> va_list args)
> >> >>  * Use the main logbuf even in NMI. But avoid calling console
> >> >>  * drivers that might have their own locks.
> >> >>  */
> >> >> -   if ((this_cpu_read(printk_context) & 
> >> >> PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
> >> >> +   if (this_cpu_read(printk_context) &
> >> >> +   (PRINTK_NMI_DIRECT_CONTEXT_MASK |
> >> >> +PRINTK_NMI_CONTEXT_MASK |
> >> >> +PRINTK_SAFE_CONTEXT_MASK)) {
> >> >

> >> But I suppose I could switch
> >> the 1 printk_nmi_direct_enter() user to printk_nmi_enter() so that
> >> PRINTK_NMI_DIRECT_CONTEXT_MASK can be removed now. I would do this in a
> >> 4th patch of the series.
> >
> > Yes, please unify the PRINTK_NMI_CONTEXT. One is enough.
> 
> Agreed. (But I'll go even further. See below.)
> 
> > I wonder if it would make sense to go even further at this stage.
> > What is possible?
> >
> > 1. We could get rid of printk_nmi_enter()/exit() and
> >PRINTK_NMI_CONTEXT completely already now. It is enough
> >to check in_nmi() in printk_func().
> >
> 
> Agreed. in_nmi() within vprintk_emit() is enough to detect if the
> console code should be skipped:
> 
> if (!in_sched && !in_nmi()) {
> ...
> }

Well, we also need to make sure that the irq work is scheduled to
call console later. We should keep this dicision in
printk_func(). I mean to replace the current

if (this_cpu_read(printk_context) &
(PRINTK_NMI_DIRECT_CONTEXT_MASK |
 PRINTK_NMI_CONTEXT_MASK |
 PRINTK_SAFE_CONTEXT_MASK)) {

with

/*
 * Avoid calling console drivers in recursive printk()
 * and in NMI context.
 */
if (this_cpu_read(printk_context) || in_nmi() {

That said, I am not sure how this fits your further rework.
I do not want to complicate it too much.

I am just afraid that the discussion about console rework might
take some time. And this would remove some complexity before we
started the more complicated or controversial changes.


> > 2. I thought about unifying printk_safe_enter()/exit() and
> >printk_enter()/exit(). They both count recursion with
> >IRQs disabled, have similar name. But they are used
> >different way.
> >
> >But better might be to rename printk_safe_enter()/exit() to
> >console_enter()/exit() or to printk_deferred_enter()/exit().
> >It would make more clear what it does now. And it might help
> >to better distinguish it from the new printk_enter()/exit().
> >
> >I am not sure if it is worth it.
> 
> I am also not sure if it is worth the extra "noise" just to give the
> function a more appropriate name. The plan is to remove it completely
> soon anyway. My vote is to leave the name as it is.

OK, let's keep printk_safe() name. It was just an idea. I wrote it
primary to sort my thoughts.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-22 Thread Petr Mladek
On Mon 2021-03-22 12:16:15, John Ogness wrote:
> On 2021-03-21, Sergey Senozhatsky  wrote:
> >> @@ -369,7 +70,10 @@ __printf(1, 0) int vprintk_func(const char *fmt, 
> >> va_list args)
> >> * Use the main logbuf even in NMI. But avoid calling console
> >> * drivers that might have their own locks.
> >> */
> >> -  if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
> >> +  if (this_cpu_read(printk_context) &
> >> +  (PRINTK_NMI_DIRECT_CONTEXT_MASK |
> >> +   PRINTK_NMI_CONTEXT_MASK |
> >> +   PRINTK_SAFE_CONTEXT_MASK)) {
> >
> > Do we need printk_nmi_direct_enter/exit() and
> > PRINTK_NMI_DIRECT_CONTEXT_MASK?  Seems like all printk_safe() paths
> > are now DIRECT - we store messages to the prb, but don't call console
> > drivers.
>
> I was planning on waiting until the kthreads are introduced, in which
> case printk_safe.c is completely removed.

You want to keep printk_safe() context because it prevents calling
consoles even in normal context. Namely, it prevents deadlock by
recursively taking, for example, sem->lock in console_lock() or
console_owner_lock in console_trylock_spinning(). Am I right?


> But I suppose I could switch
> the 1 printk_nmi_direct_enter() user to printk_nmi_enter() so that
> PRINTK_NMI_DIRECT_CONTEXT_MASK can be removed now. I would do this in a
> 4th patch of the series.

Yes, please unify the PRINTK_NMI_CONTEXT. One is enough.

I wonder if it would make sense to go even further at this stage.
There will still be 4 contexts that modify the printk behavior after
this patchset:

  + printk_count set by printk_enter()/exit()
  + prevents: infinite recursion
  + context: any context
  + action: skips entire printk at 3rd recursion level

  + prink_context set by printk_safe_enter()/exit()
  + prevents: dead lock caused by recursion into some
console code in any context
  + context: any
  + action: skips console call at 1st recursion level

  + printk_context set by printk_nmi_enter()/exit()
  + prevents: dead lock caused by any console lock recursion
  + context: NMI
  + action: skips console calls at 0th recursion level

  + kdb_trap_printk
  + redirects printk() to kdb_printk() in kdb context


What is possible?

1. We could get rid of printk_nmi_enter()/exit() and
   PRINTK_NMI_CONTEXT completely already now. It is enough
   to check in_nmi() in printk_func().

   printk_nmi_enter() was added by the commit 42a0bb3f71383b457a7db362
   ("printk/nmi: generic solution for safe printk in NMI"). It was
   really needed to modify @printk_func pointer.

   We did not remove it later when printk_function became a real
   function. The idea was to track all printk contexts in a single
   variable. But we never added kdb context.

   It might make sense to remove it now. Peter Zijstra would be happy.
   There already were some churns with tracking printk_context in NMI.
   For example, see
   https://lore.kernel.org/r/20200219150744.428764...@infradead.org

   IMHO, it does not make sense to wait until the entire console-stuff
   rework is done in this case.


2. I thought about unifying printk_safe_enter()/exit() and
   printk_enter()/exit(). They both count recursion with
   IRQs disabled, have similar name. But they are used
   different way.

   But better might be to rename printk_safe_enter()/exit() to
   console_enter()/exit() or to printk_deferred_enter()/exit().
   It would make more clear what it does now. And it might help
   to better distinguish it from the new printk_enter()/exit().

   This patchset actually splits the original printk_safe()
   functionality into two:

   + printk_count prevents infinite recursion
   + printk_deferred_enter() deffers console handling.

   I am not sure if it is worth it. But it might help people (even me)
   when digging into the printk history. Different name will help to
   understand the functionality at the given time.


What do you think, please?

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 0/3] printk: move dictionaries to meta data

2020-09-22 Thread Petr Mladek
On Sat 2020-09-19 00:40:18, John Ogness wrote:
> Hello,
> 
> Here is v2 for a series to move all existing dictionary
> properties (SUBSYSTEM and DEVICE) into the meta data of a
> record, thus eliminating the need for the dict ring. This
> change affects how the dictionaries are stored, but does not
> affect how they are presented to userspace. (v1 is here [0]).
> 
> John Ogness (3):
>   printk: move printk_info into separate array
>   printk: move dictionary keys to dev_printk_info
>   printk: remove dict ring

The patchset is committed in printk/linux.git, branch printk-rework.

The 4th version of the 2nd patch is used.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v4 2/3] printk: move dictionary keys to dev_printk_info

2020-09-21 Thread Petr Mladek
On Mon 2020-09-21 13:24:45, John Ogness wrote:
> Dictionaries are only used for SUBSYSTEM and DEVICE properties. The
> current implementation stores the property names each time they are
> used. This requires more space than otherwise necessary. Also,
> because the dictionary entries are currently considered optional,
> it cannot be relied upon that they are always available, even if the
> writer wanted to store them. These issues will increase should new
> dictionary properties be introduced.
> 
> Rather than storing the subsystem and device properties in the
> dict ring, introduce a struct dev_printk_info with separate fields
> to store only the property values. Embed this struct within the
> struct printk_info to provide guaranteed availability.
> 
> Signed-off-by: John Ogness 
> Reviewed-by: Petr Mladek 
> ---
>  Sorry. v3 did not include Petr's fixup correctly. @size was wrong.
>  Now it is correct.

I could confirm that the added line and the patch looks fine now.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 3/3] printk: remove dict ring

2020-09-21 Thread Petr Mladek
On Sat 2020-09-19 00:40:21, John Ogness wrote:
> Since there is no code that will ever store anything into the dict
> ring, remove it. If any future dictionary properties are to be
> added, these should be added to the struct printk_info.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 2/3] printk: move dictionary keys to dev_printk_info

2020-09-21 Thread Petr Mladek
On Sat 2020-09-19 00:40:20, John Ogness wrote:
> Dictionaries are only used for SUBSYSTEM and DEVICE properties. The
> current implementation stores the property names each time they are
> used. This requires more space than otherwise necessary. Also,
> because the dictionary entries are currently considered optional,
> it cannot be relied upon that they are always available, even if the
> writer wanted to store them. These issues will increase should new
> dictionary properties be introduced.
> 
> Rather than storing the subsystem and device properties in the
> dict ring, introduce a struct dev_printk_info with separate fields
> to store only the property values. Embed this struct within the
> struct printk_info to provide guaranteed availability.


> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -629,36 +624,43 @@ static ssize_t msg_print_ext_body(char *buf, size_t 
> size,
>   else
>   append_char(, e, c);
>   }
> - append_char(, e, '\n');
> + append_char(, e, endc);
>  
> - if (dict_len) {
> - bool line = true;
> + return p - buf;
> +}
>  
> - for (i = 0; i < dict_len; i++) {
> - unsigned char c = dict[i];
> +static ssize_t msg_add_dict_text(char *buf, size_t size,
> +  const char *key, const char *val)
> +{
> + size_t val_len = strlen(val);
> + ssize_t len;
>  
> - if (line) {
> - append_char(, e, ' ');
> - line = false;

I double checked this and found that the above code prefixed dict
values by ' ' in /dev/kmsg.

It slightly improves readability and it is handy for eventual filtering.
It would make sense to keep it.

> - }
> + if (!val_len)
> + return 0;
>  
> - if (c == '\0') {
> - append_char(, e, '\n');
> - line = true;
> - continue;
> - }
> + len = msg_add_ext_text(buf, size, key, strlen(key), '=');
> + len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');

Slightly ugly but simple solution is:

len = msg_add_ext_text(buf, size, "", 0, ' ');  /* dict prefix */
len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');

With this fix:

Reviewed-by: Petr Mladek 


Now, this is the only problem that I have found. It is not necessary
to resend the entire patchset just because of this.

It might be enough to either respin just this patch. Or I could
commit the below one on top of the patchset. Either solution works
for me.

>From dcc5dc0467c6e7d13202d98bbefb505a1db693fd Mon Sep 17 00:00:00 2001
From: Petr Mladek 
Date: Mon, 21 Sep 2020 11:45:16 +0200
Subject: [PATCH] printk: Put back dict lines prefix into /dev/kmsg

Put back prefix for dictionary lines in /dev/kmsg. They have been removed
by the commit  XXX ("printk: move dictionary keys to dev_printk_info").

Signed-off-by: Petr Mladek 
---
 kernel/printk/printk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 77660354a7c5..1fe3d0cb2fe0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -637,7 +637,8 @@ static ssize_t msg_add_dict_text(char *buf, size_t size,
if (!val_len)
return 0;
 
-   len = msg_add_ext_text(buf, size, key, strlen(key), '=');
+   len = msg_add_ext_text(buf, size, "", 0, ' ');  /* dict prefix */
+   len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
 
return len;
-- 
2.26.2


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v2 1/3] printk: move printk_info into separate array

2020-09-21 Thread Petr Mladek
On Sat 2020-09-19 00:40:19, John Ogness wrote:
> The majority of the size of a descriptor is taken up by meta data,
> which is often not of interest to the ringbuffer (for example,
> when performing state checks). Since descriptors are often
> temporarily stored on the stack, keeping their size minimal will
> help reduce stack pressure.
> 
> Rather than embedding the printk_info into the descriptor, create
> a separate printk_info array. The index of a descriptor in the
> descriptor array corresponds to the printk_info with the same
> index in the printk_info array. The rules for validity of a
> printk_info match the existing rules for the data blocks: the
> descriptor must be in a consistent state.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk 3/3] printk: remove dict ring

2020-09-18 Thread Petr Mladek
On Thu 2020-09-17 15:22:44, John Ogness wrote:
> Since there is no code that will ever store anything into the dict
> ring, remove it. If any future dictionary properties are to be
> added, these should be added to the struct printk_info.
> 
> Signed-off-by: John Ogness 

I like the result. It simplified the code and removed some twists.

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk 2/3] printk: move dictionary keys to dev_printk_info

2020-09-18 Thread Petr Mladek
On Fri 2020-09-18 14:32:41, Rasmus Villemoes wrote:
> On 18/09/2020 14.13, Petr Mladek wrote:
> > On Fri 2020-09-18 08:16:37, Rasmus Villemoes wrote:
> >> On 17/09/2020 15.16, John Ogness wrote:
> >>
> >>>   if (dev->class)
> >>>   subsys = dev->class->name;
> >>>   else if (dev->bus)
> >>>   subsys = dev->bus->name;
> >>>   else
> >>> - return 0;
> >>> + return;
> >>>  
> >>> - pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys);
> >>> - if (pos >= hdrlen)
> >>> - goto overflow;
> >>> + snprintf(dev_info->subsystem, sizeof(dev_info->subsystem), subsys);
> >>
> >> It's unlikely that subsys would contain a %, but this will be yet
> >> another place to spend brain cycles ignoring if doing static analysis.
> >> So can we not do this. Either of strXcpy() for X=s,l will do the same
> >> thing, and likely faster.
> > 
> > Good point! Better be on the safe size in a generic printk() API.
> > 
> > Well, I am afraid that this would be only small drop in a huge lake.
> > class->name and bus->name seems to be passed to %s in so many
> > *print*() calls all over the kernel code.
> 
> So what? printf("%s", some_string_that_might_contain_percent_chars) is
> not a problem.

Grr, shame on me. I have completely messed this. The combination of
Friday afternoon and noisy kids did not help me much to get it right.

> printf(some_string_that_might_contain_percent_chars) is.

I fully agree that passing unknown string as "fmt" is dangerous and
must be used carefully. It is not needed here.

> And yes, one could do
> 
>   snprintf(dev_info->subsystem, sizeof(dev_info->subsystem), "%s", subsys);
>
> but one might as well avoid the snprintf overhead and use one of the
> strX functions that have the exact same semantics (copy as much as
> there's room for, ensure nul-termination).

Yes, we should use either snprinf() with %s or strXcpy().

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk 2/3] printk: move dictionary keys to dev_printk_info

2020-09-18 Thread Petr Mladek
On Fri 2020-09-18 08:16:37, Rasmus Villemoes wrote:
> On 17/09/2020 15.16, John Ogness wrote:
> 
> > if (dev->class)
> > subsys = dev->class->name;
> > else if (dev->bus)
> > subsys = dev->bus->name;
> > else
> > -   return 0;
> > +   return;
> >  
> > -   pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys);
> > -   if (pos >= hdrlen)
> > -   goto overflow;
> > +   snprintf(dev_info->subsystem, sizeof(dev_info->subsystem), subsys);
> 
> It's unlikely that subsys would contain a %, but this will be yet
> another place to spend brain cycles ignoring if doing static analysis.
> So can we not do this. Either of strXcpy() for X=s,l will do the same
> thing, and likely faster.

Good point! Better be on the safe size in a generic printk() API.

Well, I am afraid that this would be only small drop in a huge lake.
class->name and bus->name seems to be passed to %s in so many
*print*() calls all over the kernel code.

IMHO, this is not the right place to prevent the problem. Dangerous
names must be prevented when a new bus, class, device is added.

Best Rergards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk 2/3] printk: move dictionary keys to dev_printk_info

2020-09-18 Thread Petr Mladek
On Thu 2020-09-17 15:22:43, John Ogness wrote:
> Dictionaries are only used for SUBSYSTEM and DEVICE properties. The
> current implementation stores the property names each time they are
> used. This requires more space than otherwise necessary. Also,
> because the dictionary entries are currently considered optional,
> it cannot be relied upon that they are always available, even if the
> writer wanted to store them. These issues will increase should new
> dictionary properties be introduced.
> 
> Rather than storing the subsystem and device properties in the
> dict ring, introduce a struct dev_printk_info with separate fields
> to store only the property values. Embed this struct within the
> struct printk_info to provide guaranteed availability.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk 1/3] printk: move printk_info into separate array

2020-09-18 Thread Petr Mladek
On Thu 2020-09-17 15:22:42, John Ogness wrote:
> The majority of the size of a descriptor is taken up by meta data,
> which is often not of interest to the ringbuffer (for example,
> when performing state checks). Since descriptors are often
> temporarily stored on the stack, keeping their size minimal will
> help reduce stack pressure.
> 
> Rather than embedding the printk_info into the descriptor, create
> a separate printk_info array. The index of a descriptor in the
> descriptor array corresponds to the printk_info with the same
> index in the printk_info array. The rules for validity of a
> printk_info match the existing rules for the data blocks: the
> descriptor must be in a consistent state.

I like this approach.

IMHO, it better separates dict_ring- and printk-specific metadata.

The three printk-specific values (seq, caller_id, len) are still
accessed by the ring buffer code. It is not ideal. But it helps
to hide tricky operations in the ring buffer API, keep the code
more safe and sane. These exceptions are actually better
visible now.

> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1097,6 +1097,7 @@ static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] 
> __initdata;
>  
>  void __init setup_log_buf(int early)
>  {
> + struct printk_info *new_infos;
>   unsigned int new_descs_count;
>   struct prb_desc *new_descs;
>   struct printk_info info;
> @@ -1156,6 +1157,17 @@ void __init setup_log_buf(int early)
>   return;
>   }
>  
> + new_descs_size = new_descs_count * sizeof(struct printk_info);

Must be stored into new variable, e.g.  new_infos_size.=

> + new_infos = memblock_alloc(new_descs_size, LOG_ALIGN);
> + if (unlikely(!new_infos)) {
> + pr_err("log_buf_len: %zu info bytes not available\n",
> +new_descs_size);
> + memblock_free(__pa(new_descs), new_log_buf_len);
> + memblock_free(__pa(new_dict_buf), new_log_buf_len);

The above two calls have wrong size.

The same problem is there also in the error path when new_descs
allocation fail. It might be better to handle this using some
goto err_* tagrets.

Please, fix the old problem in a separate patch.

> + memblock_free(__pa(new_log_buf), new_log_buf_len);
> + return;
> + }
> +
>   prb_rec_init_rd(, ,
>   _text_buf[0], sizeof(setup_text_buf),
>   _dict_buf[0], sizeof(setup_dict_buf));


> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -1726,12 +1762,12 @@ static bool copy_data(struct prb_data_ring *data_ring,
>   /*
>* Actual cannot be less than expected. It can be more than expected
>* because of the trailing alignment padding.
> +  *
> +  * Note that invalid @len values can occur because the caller loads
> +  * the value during an allowed data race.

I hope that this will not bite us in the future. The fact is that
copying the entire struct printk_info in get_desc() is ugly and
copy_data() has to be careful anyway.

>*/
> - if (WARN_ON_ONCE(data_size < (unsigned int)len)) {
> - pr_warn_once("wrong data size (%u, expecting >=%hu) for data: 
> %.*s\n",
> -  data_size, len, data_size, data);
> + if (data_size < (unsigned int)len)


>   return false;
> - }
>  
>   /* Caller interested in the line count? */
>   if (line_count)

Otherwise, I like this patch.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v5 0/6] printk: reimplement LOG_CONT handling

2020-09-15 Thread Petr Mladek
On Mon 2020-09-14 14:39:48, John Ogness wrote:
> Hello,
> 
> Here is v5 for the second series to rework the printk subsystem.
> (The v4 is here [0].) This series implements a new ringbuffer
> feature that allows the last record to be extended. Petr Mladek
> provided the initial proof of concept [1] for this.
> 
> Using the record extension feature, LOG_CONT is re-implemented
> in a way that exactly preserves its behavior, but avoids the
> need for an extra buffer. In particular, it avoids the need for
> any synchronization that such a buffer requires.

The series is ready for linux-next from my POV. I am going to push
it there tomorrow if nobody complains.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v5 5/6] printk: ringbuffer: add finalization/extension support

2020-09-15 Thread Petr Mladek
On Mon 2020-09-14 14:39:53, John Ogness wrote:
> Add support for extending the newest data block. For this, introduce
> a new finalization state (desc_finalized) denoting a committed
> descriptor that cannot be extended.
> 
> Signed-off-by: John Ogness 

Looks good to me:

Reviewed-by: Petr Mladek 

There seems to be possible a small clean up, see below. But I would do
it in a followup patch to avoid yet another respin.


> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index 911fbe150e9a..4e526c79f89c 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> +/*
> + * Try to resize an existing data block associated with the descriptor
> + * specified by @id. If the resized data block should become wrapped, it
> + * copies the old data to the new data block. If @size yields a data block
> + * with the same or less size, the data block is left as is.
> + *
> + * Fail if this is not the last allocated data block or if there is not
> + * enough space or it is not possible make enough space.
> + *
> + * Return a pointer to the beginning of the entire data buffer or NULL on
> + * failure.
> + */
> +static char *data_realloc(struct printk_ringbuffer *rb,
> +   struct prb_data_ring *data_ring, unsigned int size,
> +   struct prb_data_blk_lpos *blk_lpos, unsigned long id)
> +{
> + struct prb_data_block *blk;
> + unsigned long head_lpos;
> + unsigned long next_lpos;
> + bool wrapped;
> +
> + /* Reallocation only works if @blk_lpos is the newest data block. */
> + head_lpos = atomic_long_read(_ring->head_lpos);
> + if (head_lpos != blk_lpos->next)
> + return NULL;
> +
> + /* Keep track if @blk_lpos was a wrapping data block. */
> + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != 
> DATA_WRAPS(data_ring, blk_lpos->next));
> +
> + size = to_blk_size(size);
> +
> + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
> +
> + /* If the data block does not increase, there is nothing to do. */
> + if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
> + blk = to_block(data_ring, blk_lpos->begin);
> + return >data[0];
> + }
> +
> + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring)))
> + return NULL;
> +
> + /* The memory barrier involvement is the same as data_alloc:A. */
> + if (!atomic_long_try_cmpxchg(_ring->head_lpos, _lpos,
> +  next_lpos)) { /* LMM(data_realloc:A) */
> + return NULL;
> + }
> +
> + blk = to_block(data_ring, blk_lpos->begin);
> +
> + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, 
> next_lpos)) {
> + struct prb_data_block *old_blk = blk;
> +
> + /* Wrapping data blocks store their data at the beginning. */
> + blk = to_block(data_ring, 0);
> +
> + /*
> +  * Store the ID on the wrapped block for consistency.
> +  * The printk_ringbuffer does not actually use it.
> +  */
> + blk->id = id;

Small cleanup: The "id" should already be there when the block has
already been wrapped before. By other words, even the above
need to be done only when (!wrapped).

> +
> + if (!wrapped) {
> + /*
> +  * Since the allocated space is now in the newly
> +  * created wrapping data block, copy the content
> +  * from the old data block.
> +  */
> + memcpy(>data[0], _blk->data[0],
> +(blk_lpos->next - blk_lpos->begin) - 
> sizeof(blk->id));
> + }
> + }
> +
> + blk_lpos->next = next_lpos;
> +
> + return >data[0];
> +}
> +
>  /* Return the number of bytes used by a data block. */
>  static unsigned int space_used(struct prb_data_ring *data_ring,
>  struct prb_data_blk_lpos *blk_lpos)

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v4 5/6] printk: ringbuffer: add finalization/extension support

2020-09-09 Thread Petr Mladek
On Tue 2020-09-08 22:34:58, John Ogness wrote:
> Add support for extending the newest data block. For this, introduce
> a new finalization state (desc_finalized) denoting a committed
> descriptor that cannot be extended.
> 
> Until a record is finalized, a writer can reopen that record to
> append new data. Reopening a record means transitioning from the
> desc_committed state back to the desc_reserved state.
> 
> A writer can explicitly finalize a record if there is no intention
> of extending it. Also, records are automatically finalized when a
> new record is reserved. This relieves writers of needing to
> explicitly finalize while also making such records available to
> readers sooner. (Readers can only traverse finalized records.)
> 
> Four new memory barrier pairs are introduced. Two of them are
> insignificant additions (data_realloc:A/desc_read:D and
> data_realloc:A/data_push_tail:B) because they are alternate path
> memory barriers that exactly match the purpose, pairing, and
> context of the two existing memory barrier pairs they provide an
> alternate path for. The other two new memory barrier pairs are
> significant additions:
> 
> desc_reopen_last:A / _prb_commit:B - When reopening a descriptor,
> ensure the state transitions back to desc_reserved before
> fully trusting the descriptor data.
> 
> _prb_commit:B / desc_reserve:D - When committing a descriptor,
> ensure the state transitions to desc_committed before checking
> the head ID to see if the descriptor needs to be finalized.
> 
> Signed-off-by: John Ogness 
> ---
>  Documentation/admin-guide/kdump/gdbmacros.txt |   3 +-
>  kernel/printk/printk_ringbuffer.c | 541 --
>  kernel/printk/printk_ringbuffer.h |   6 +-
>  scripts/gdb/linux/dmesg.py|   3 +-
>  4 files changed, 491 insertions(+), 62 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt 
> b/Documentation/admin-guide/kdump/gdbmacros.txt
> index 8f533b751c46..94fabb165abf 100644
> --- a/Documentation/admin-guide/kdump/gdbmacros.txt
> +++ b/Documentation/admin-guide/kdump/gdbmacros.txt
> @@ -297,6 +297,7 @@ end
>  define dmesg
>   # definitions from kernel/printk/printk_ringbuffer.h
>   set var $desc_committed = 1
> + set var $desc_finalized = 2
>   set var $desc_sv_bits = sizeof(long) * 8
>   set var $desc_flags_shift = $desc_sv_bits - 2
>   set var $desc_flags_mask = 3 << $desc_flags_shift
> @@ -313,7 +314,7 @@ define dmesg
>  
>   # skip non-committed record
>   set var $state = 3 & ($desc->state_var.counter >> 
> $desc_flags_shift)
> - if ($state == $desc_committed)
> + if ($state == $desc_committed || $state == $desc_finalized)
>   dump_record $desc $prev_flags
>   set var $prev_flags = $desc->info.flags
>   end
> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index 911fbe150e9a..f1fab8c82819 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -360,8 +423,8 @@ static enum desc_state get_desc_state(unsigned long id,
>  
>  /*
>   * Get a copy of a specified descriptor and its queried state. A descriptor
> - * that is not in the committed or reusable state must be considered garbage
> - * by the reader.
> + * that is not in the finalized or reusable state must be considered garbage
> + * by readers.

There is one exception. The entire descriptor is read also in
committed state. The content is considered valid when the stored
desc.info.caller_id matches @caller_id of the reader. It is
used when the descriptor gets reopened.

Well, it seems that only desc_out->caller_id value is used
in desc_reopen_last(). All the other values are accessed
directly when the descriptor is successfully reopened.

We should mention this exception in the comment at least.

>   */
>  static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
>unsigned long id, struct prb_desc *desc_out)
> @@ -374,22 +437,29 @@ static enum desc_state desc_read(struct prb_desc_ring 
> *desc_ring,
>   /* Check the descriptor state. */
>   state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
>   d_state = get_desc_state(id, state_val);
> - if (d_state != desc_committed && d_state != desc_reusable)
> + if (d_state == desc_miss || d_state == desc_reserved) {
> + /*
> +  * The descriptor is in an inconsistent state. Set at least
> +  * @state_var so that the caller can see the details of
> +  * the inconsistent state.
> +  */
> + atomic_long_set(_out->state_var, state_val);

This change looks unrelated. If I get it correctly, the value is
needed in desc_push_tail() to check whether the descriptor still
contains ID from the previous wrap. It fixes an older bug.
And it is a great 

Re: [PATCH printk v4 4/6] printk: ringbuffer: change representation of states

2020-09-09 Thread Petr Mladek
On Tue 2020-09-08 22:34:57, John Ogness wrote:
> Rather than deriving the state by evaluating bits within the flags
> area of the state variable, assign the states explicit values and
> set those values in the flags area. Introduce macros to make it
> simple to read and write state values for the state variable.
> 
> Although the functionality is preserved, the binary representation
> for the states is changed.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH printk v4 3/6] printk: ringbuffer: clear initial reserved fields

2020-09-09 Thread Petr Mladek
On Tue 2020-09-08 22:34:56, John Ogness wrote:
> prb_reserve() will set some meta data values and leave others
> uninitialized (or rather, containing the values of the previous
> wrap). Simplify the API by always clearing out all the fields.
> Only the sequence number is filled in. The caller is now
> responsible for filling in the rest of the meta data fields.
> In particular, for correctly filling in text and dict lengths.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH next v3 7/8] printk: reimplement log_cont using record extension

2020-09-02 Thread Petr Mladek
On Mon 2020-08-31 03:16:57, John Ogness wrote:
> Use the record extending feature of the ringbuffer to implement
> continuous messages. This preserves the existing continuous message
> behavior.
> 
> Signed-off-by: John Ogness 
> ---
>  kernel/printk/printk.c | 98 +-
>  1 file changed, 20 insertions(+), 78 deletions(-)

It looks simple from the printk() side ;-)

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: state names: vas: Re: [PATCH next v3 6/8] printk: ringbuffer: add finalization/extension support

2020-09-02 Thread Petr Mladek
On Wed 2020-09-02 13:26:14, John Ogness wrote:
> On 2020-09-02, Petr Mladek  wrote:
> >> +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
> >> +   u32 caller_id, unsigned long *id_out)
> >> +{
> >> +  unsigned long prev_state_val;
> >> +  enum desc_state d_state;
> >> +  struct prb_desc desc;
> >> +  struct prb_desc *d;
> >> +  unsigned long id;
> >> +
> >> +  id = atomic_long_read(_ring->head_id);
> >> +
> >> +  /*
> >> +   * To minimize unnecessarily reopening a descriptor, first check the
> >> +   * descriptor is in the correct state and has a matching caller ID.
> >> +   */
> >> +  d_state = desc_read(desc_ring, id, );
> >> +  if (d_state != desc_reserved ||
> >> +  !(atomic_long_read(_var) & DESC_COMMIT_MASK) ||
> >
> > First, define 5 desc_states, something like:
> >
> > enum desc_state {
> > desc_miss = -1, /* ID mismatch */
> > desc_modified =  0x0,   /* reserved, being modified by writer */
> 
> I prefer the "desc_reserved" name. It may or may not have be modified yet.

Yeah, "desc_reserved" sounds better. I probably just wanted to free my
fantasy from the current code ;-)


> > desc_committed = 0x1,   /* committed by writer, could get reopened */
> > desc_finalized = 0x2,   /* committed, could not longer get modified */
> > desc_reusable =  0x3,   /* free, not yet used by any writer */
> > };
> >
> > Second, only 4 variants of the 3 state bits are currently used.
> > It means that two bits are enough and they might use exactly
> > the above names:
> >
> > I mean to do something like:
> >
> > #define DESC_SV_BITS(sizeof(unsigned long) * 8)
> > #define DESC_SV(desc_state) ((unsigned long)desc_state << (DESC_SV_BITS - 
> > 2))
> > #define DESC_ST(state_val)  ((unsigned long)state_val >> (DESC_SV_BITS - 2))
> 
> This makes sense and will get us back the bit we lost because of
> finalization.

Yup. Which is good especially on 32-bit architectures.

> I am wondering if VMCOREINFO should include a DESC_FLAGS_MASK so that
> crash tools could at least successfully iterate the ID's, even if they
> didn't know what all the flag values mean (in the case that more bits
> are added later).

Good point. I am just not sure whether they should try read all ids
or they should refuse reading anything when a new bit is added.

Well, I really hope that we will not need new states anytime soon.
It would need a really strong reason.

I personally can't think about any use case. pr_cont() was special
because it was the writer side. All other steps of the printk rework
are on the reader side.

I believe that we are getting close with all the ring buffer code.
And I have good feeling about it.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


misc: was: [PATCH next v3 6/8] printk: ringbuffer: add finalization/extension support

2020-09-02 Thread Petr Mladek
On Mon 2020-08-31 03:16:56, John Ogness wrote:
> Add support for extending the newest data block. For this, introduce
> a new finalization state flag (DESC_FINAL_MASK) that denotes when a
> descriptor may not be extended, i.e. is finalized.
> 
> Three new memory barrier pairs are introduced. Two of them are not
> significant because they are alternate path memory barriers that
> exactly correspond to existing memory barriers.
>
> But the third (_prb_commit:B / desc_reserve:D) is new and guarantees
> that descriptors will always be finalized, either because a
> descriptor setting DESC_COMMIT_MASK sees that there is a newer
> descriptor and so finalizes itself or because a new descriptor being
> reserved sees that the previous descriptor has DESC_COMMIT_MASK set
> and finalizes that descriptor.

Just for record. All the barriers look good to me.

There are always full barriers when we change DESC_COMMIT_MASK.
It guarantees consistency of the descriptor and data rings.

The only relaxed modification is adding DESC_FINAL_MASK. It is OK.
It can be done only when DESC_COMMIT_MASK has been set before
using a full barrier. It means that all changes of the rings
has already been since before.


> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> +
> +/**
> + * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
> + * used by the newest record.
> + *
> + * @e: The entry structure to setup.
> + * @rb:The ringbuffer to re-reserve and extend data in.
> + * @r: The record structure to allocate buffers for.
> + * @caller_id: The caller ID of the caller (reserving writer).
> + *
> + * This is the public function available to writers to re-reserve and extend
> + * data.
> + *
> + * The writer specifies the text size to extend (not the new total size) by
> + * setting the @text_buf_size field of @r. Dictionaries cannot be extended so

Better might be to say that extending dictionaries is not
supported.

> + * @dict_buf_size of @r should be set to 0. To ensure proper initialization 
> of
> + * @r, prb_rec_init_wr() should be used.
> + *
> + * This function will fail if @caller_id does not match the caller ID of the
> + * newest record. In that case the caller must reserve new data using
> + * prb_reserve().
> + *
> + * Context: Any context. Disables local interrupts on success.
> + * Return: true if text data could be extended, otherwise false.
> + *
> + * On success:
> + *
> + *   - @r->text_buf points to the beginning of the entire text buffer.
> + *
> + *   - @r->text_buf_len is set to the new total size of the buffer.

s/buf_len/buf_size/

> + *   - @r->dict_buf and @r->dict_buf_len are cleared because extending
> + * the dict buffer is not supported.

Same here.

> + *
> + *   - @r->info is not touched so that @r->info->text_len could be used
> + * to append the text.
> + *
> + *   - prb_record_text_space() can be used on @e to query the new
> + * actually used space.
> + *
> + * Important: All @r->info fields will already be set with the current values
> + *for the record. I.e. @r->info->text_len will be less than
> + *@text_buf_size and @r->info->dict_len may be set, even though
> + *@dict_buf_size is 0. Writers can use @r->info->text_len to know
> + *where concatenation begins and writers should update
> + *@r->info->text_len after concatenating.
> + */
> +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct 
> printk_ringbuffer *rb,
> +  struct printk_record *r, u32 caller_id)
> +{
> + unsigned int data_size;
> + struct prb_desc *d;
> + unsigned long id;
> +
> + local_irq_save(e->irqflags);
> +
> + /* Transition the newest descriptor back to the reserved state. */
> + d = desc_reopen_last(>desc_ring, caller_id, );
> + if (!d) {
> + local_irq_restore(e->irqflags);
> + goto fail_reopen;
> + }
> +
> + /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
> +
> + /*
> +  * Set the @e fields here so that prb_commit() can be used if
> +  * anything fails from now on.
> +  */
> + e->rb = rb;
> + e->id = id;
> +
> + /*
> +  * desc_reopen_last() checked the caller_id, but there was no
> +  * exclusive access at that point. The descriptor may have
> +  * changed since then.
> +  */
> + if (caller_id != d->info.caller_id)
> + goto fail;
> +
> + if (BLK_DATALESS(>text_blk_lpos)) {

We do not clear d->info when reopening the descriptor. I would at
least add here a consistency check similar to the one below:

if (WARN_ON_ONCE(d->info.text_len)) {
pr_warn_once("wrong data size (%u, expecting 0) for 
data\n",
 d->info.text_len);
d->info.text_len = 0;
}

> + 

Re: [PATCH next v3 6/8] printk: ringbuffer: add finalization/extension support

2020-09-02 Thread Petr Mladek
On Mon 2020-08-31 03:16:56, John Ogness wrote:
> Add support for extending the newest data block. For this, introduce
> a new finalization state flag (DESC_FINAL_MASK) that denotes when a
> descriptor may not be extended, i.e. is finalized.
> 
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> +/*
> + * Try to resize an existing data block associated with the descriptor
> + * specified by @id. If the resized datablock should become wrapped, it
> + * copies the old data to the new data block.
> + *
> + * Fail if this is not the last allocated data block or if there is not
> + * enough space or it is not possible make enough space.
> + *
> + * Return a pointer to the beginning of the entire data buffer or NULL on
> + * failure.
> + */
> +static char *data_realloc(struct printk_ringbuffer *rb,
> +   struct prb_data_ring *data_ring, unsigned int size,
> +   struct prb_data_blk_lpos *blk_lpos, unsigned long id)
> +{
> + struct prb_data_block *blk;
> + unsigned long head_lpos;
> + unsigned long next_lpos;
> + bool wrapped;
> +
> + /* Reallocation only works if @blk_lpos is the newest data block. */
> + head_lpos = atomic_long_read(_ring->head_lpos);
> + if (head_lpos != blk_lpos->next)
> + return NULL;
> +
> + /* Keep track if @blk_lpos was a wrapping data block. */
> + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != 
> DATA_WRAPS(data_ring, blk_lpos->next));
> +
> + size = to_blk_size(size);
> +
> + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
> +
> + /* If the data block does not increase, there is nothing to do. */
> + if (next_lpos == head_lpos) {
> + blk = to_block(data_ring, blk_lpos->begin);
> + return >data[0];
> + }

We might be here even when the data are shrinked but the code below
is not fully ready for this.

> + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring)))
> + return NULL;
> +
> + /* The memory barrier involvement is the same as data_alloc:A. */
> + if (!atomic_long_try_cmpxchg(_ring->head_lpos, _lpos,
> +  next_lpos)) { /* LMM(data_realloc:A) */
> + return NULL;
> + }
> +
> + blk = to_block(data_ring, blk_lpos->begin);
> +
> + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, 
> next_lpos)) {
> + struct prb_data_block *old_blk = blk;
> +
> + /* Wrapping data blocks store their data at the beginning. */
> + blk = to_block(data_ring, 0);
> +
> + /*
> +  * Store the ID on the wrapped block for consistency.
> +  * The printk_ringbuffer does not actually use it.
> +  */
> + blk->id = id;
> +
> + if (!wrapped) {
> + /*
> +  * Since the allocated space is now in the newly
> +  * created wrapping data block, copy the content
> +  * from the old data block.
> +  */
> + memcpy(>data[0], _blk->data[0],
> +(blk_lpos->next - blk_lpos->begin) - 
> sizeof(blk->id));

It took me quite some time to check whether this code is correct or not.

First, I wondered whether the size was correctly calculated.
It is because the original block was not wrapped, so
lpos->next - lpos->beging defines the real data buffer size.

Second, I wondered whether the target block might be smaller
than the original (the above check allows shrinking). It can't
be smaller because then the new block won't be wrapped as well.

Sigh, it is a bit tricky. And there is 3rd possibility that
is not handled. The original block might be wrapped but
the new shrunken one might not longer be wrapped.
Then we would need to copy the data the other way.


I know that this function is not currently used for shrinking.
But I would prefer to be on the safe side. Either make
the copying generic, e.g. by calculating the real data size
using the code from get_data(). Or simply comletely
refuse shrinking by the above check.

Best Regards,
Petr

> + }
> + }
> +
> + blk_lpos->next = next_lpos;
> +
> + return >data[0];
> +}
> +
>  /* Return the number of bytes used by a data block. */
>  static unsigned int space_used(struct prb_data_ring *data_ring,
>  struct prb_data_blk_lpos *blk_lpos)

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


state names: vas: Re: [PATCH next v3 6/8] printk: ringbuffer: add finalization/extension support

2020-09-02 Thread Petr Mladek
On Mon 2020-08-31 03:16:56, John Ogness wrote:
> Add support for extending the newest data block. For this, introduce
> a new finalization state flag (DESC_FINAL_MASK) that denotes when a
> descriptor may not be extended, i.e. is finalized.
 
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -49,14 +49,16 @@
>   * Descriptors have three states:
>   *
>   *   reserved
> - * A writer is modifying the record.
> + * A writer is modifying the record. Internally represented as either "0"
> + * or "DESC_COMMIT_MASK".

We should explain the difference between the two values. It might be
enough to add something like:

See "Descriptor Finalization" section for more details."

> @@ -79,6 +81,25 @@
>   * committed or reusable queried state. This makes it possible that a valid
>   * sequence number of the tail is always available.
>   *
> + * Descriptor Finalization
> + * ~~~
> + * When a writer calls the commit function prb_commit(), the record may still
> + * continue to be in the reserved queried state. In order for that record to
> + * enter into the committed queried state, that record also must be 
> finalized.
> + * A record can be finalized by three different scenarios:
> + *
> + *   1) A writer can finalize its record immediately by calling
> + *  prb_final_commit() instead of prb_commit().
> + *
> + *   2) When a new record is reserved and the previous record has been
> + *  committed via prb_commit(), that previous record is finalized.
> + *
> + *   3) When a record is committed via prb_commit() and a newer record
> + *  already exists, the record being committed is finalized.
> + *
> + * Until a record is finalized (represented by "DESC_FINAL_MASK"), a writer
> + * may "reopen" that record and extend it with more data.
> + *
>   * Data Rings
>   * ~~
>   * The two data rings (text and dictionary) function identically. They exist

[...]

> +/*
> + * Attempt to remove the commit flag so that the record can be modified by a
> + * writer again. This is only possible if the descriptor is not yet 
> finalized.
> + *
> + * Note that on success, the queried state did not change. A non-finalized
> + * record (even with the commit flag set) is in the reserved queried state.
> + */
> +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
> +  u32 caller_id, unsigned long *id_out)
> +{
> + unsigned long prev_state_val;
> + enum desc_state d_state;
> + struct prb_desc desc;
> + struct prb_desc *d;
> + unsigned long id;
> +
> + id = atomic_long_read(_ring->head_id);
> +
> + /*
> +  * To minimize unnecessarily reopening a descriptor, first check the
> +  * descriptor is in the correct state and has a matching caller ID.
> +  */
> + d_state = desc_read(desc_ring, id, );
> + if (d_state != desc_reserved ||
> + !(atomic_long_read(_var) & DESC_COMMIT_MASK) ||

This looks like a hack. And similar extra check of the bit is needed
also in desc_read(), see
https://lore.kernel.org/r/878sdvq8kd@jogness.linutronix.de

I has been actually getting less and less happy with the inconsistency
between names of the bits and states.

Sigh, you will hate me because this would mean a bigger change.
IMHO, it would be much cleaner and help with long-term maintainability
when we do the following two changes:

First, define 5 desc_states, something like:

enum desc_state {
desc_miss = -1, /* ID mismatch */
desc_modified =  0x0,   /* reserved, being modified by writer */
desc_committed = 0x1,   /* committed by writer, could get reopened */
desc_finalized = 0x2,   /* committed, could not longer get modified */
desc_reusable =  0x3,   /* free, not yet used by any writer */
};

Second, only 4 variants of the 3 state bits are currently used.
It means that two bits are enough and they might use exactly
the above names:

I mean to do something like:

#define DESC_SV_BITS(sizeof(unsigned long) * 8)
#define DESC_SV(desc_state) ((unsigned long)desc_state << (DESC_SV_BITS - 
2))
#define DESC_ST(state_val)  ((unsigned long)state_val >> (DESC_SV_BITS - 2))


Then we could have:

static enum desc_state get_desc_state(unsigned long id,
  unsigned long state_val)
{
if (id != DESC_ID(state_val))
return desc_miss;

return DESC_ST(state_val);
}

and use in the code:

unsigned long val_committed = id | DESC_SV(desc_committed);


or do

#define DESC_SV(id, desc_state) (id | (unsigned long)desc_state << 
(DESC_SV_BITS - 2))

and then use DESC_SV(id, DESC_COMMITTED).


I am sorry that I did not came up with this earlier. I know how
painful it is to rework bigger patchsets. But it affects format
of the ring buffer, so we should do it early.

Best Regards,
Petr

PS: I am still middle of review. It looks good so far. I wanted to

Re: [PATCH next v3 5/8] printk: ringbuffer: clear initial reserved fields

2020-09-01 Thread Petr Mladek
On Mon 2020-08-31 03:16:55, John Ogness wrote:
> prb_reserve() will set some meta data values and leave others
> uninitialized (or rather, containing the values of the previous
> wrap). Simplify the API by always clearing out all the fields.
> Only the sequence number is filled in. The caller is now
> responsible for filling in the rest of the meta data fields.
> In particular, for correctly filling in text and dict lengths.
> 
> Signed-off-by: John Ogness 
>
> ---
>  kernel/printk/printk.c|  7 ++-
>  kernel/printk/printk_ringbuffer.c | 29 +++--
>  2 files changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index ad8d1dfe5fbe..7e7d596c8878 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -520,8 +520,11 @@ static int log_store(u32 caller_id, int facility, int 
> level,
>   memcpy(_buf[0], text, text_len);
>   if (trunc_msg_len)
>   memcpy(_buf[text_len], trunc_msg, trunc_msg_len);
> - if (r.dict_buf)
> + r.info->text_len = text_len + trunc_msg_len;
> + if (r.dict_buf) {
>   memcpy(_buf[0], dict, dict_len);
> + r.info->dict_len = dict_len;
> + }
>   r.info->facility = facility;
>   r.info->level = level & 7;
>   r.info->flags = flags & 0x1f;
> @@ -1078,9 +1081,11 @@ static unsigned int __init add_to_rb(struct 
> printk_ringbuffer *rb,
>   return 0;
>  
>   memcpy(_r.text_buf[0], >text_buf[0], dest_r.text_buf_size);
> + dest_r.info->text_len = r->info->text_len;

It looks a bit non-consistent that memcpy() copies text_buf_size but valid data
are defined by text_len.

I would prefer to be on the safe size and copy only text_len.

I could imagine some later optimization that will make buf_size bigger
by alignment or something like that. The reason might be preparation
for continuous lines or so. Then the size of the original buffer and
the destination one might differ.

It is unrelated to this patch. I would solve it by a preparation
or followup one.


>   if (dest_r.dict_buf) {

>   memcpy(_r.dict_buf[0], >dict_buf[0],
>  dest_r.dict_buf_size);
> + dest_r.info->dict_len = r->info->dict_len;

Same here.

>   }
>   dest_r.info->facility = r->info->facility;
>   dest_r.info->level = r->info->level;
> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index d66718e74aae..da54d4fadf96 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -1159,6 +1162,18 @@ bool prb_reserve(struct prb_reserved_entry *e, struct 
> printk_ringbuffer *rb,
>  
>   d = to_desc(desc_ring, id);
>  
> + /*
> +  * Clear all @info fields except for @seq, which is used to determine
> +  * the new sequence number. The writer must fill in new values.
> +  */
> + d->info.ts_nsec = 0;
> + d->info.text_len = 0;
> + d->info.dict_len = 0;
> + d->info.facility = 0;
> + d->info.flags = 0;
> + d->info.level = 0;
> + d->info.caller_id = 0;

This will be easy to miss when a new field is added in the future.

I would prefer to store @seq into temporary variable and clear
the entire structure by memset(). The new sequence number is
calculated few lines below anyway.


Otherwise, it looks good to me.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH][next] docs: vmcoreinfo: add lockless printk ringbuffer vmcoreinfo

2020-09-01 Thread Petr Mladek
On Fri 2020-08-14 23:39:16, John Ogness wrote:
> With the introduction of the lockless printk ringbuffer, the
> VMCOREINFO relating to the kernel log buffer was changed. Update the
> documentation to match those changes.
> 
> Fixes: ("printk: use the lockless ringbuffer")
> Signed-off-by: John Ogness 
> Reported-by: Nick Desaulniers 

The patch is committed in printk/linux.git, branch printk-rework.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 5/7][next] printk: ringbuffer: add finalization/extension support

2020-08-28 Thread Petr Mladek
On Thu 2020-08-27 12:04:58, John Ogness wrote:
> On 2020-08-26, Petr Mladek  wrote:
> >> This series makes a very naive assumption that the previous
> >> descriptor is either in the reserved or committed queried states. The
> >> fact is, it can be in any of the 4 queried states. Adding support for
> >> finalization of all the states then gets quite complex, since any
> >> state transition (cmpxchg) may have to deal with an unexpected FINAL
> >> flag.
> >
> > It has to be done in two steps to avoid race:
> >
> > prb_commit()
> >
> >+ set PRB_COMMIT_MASK
> >+ check if it is still the last descriptor in the array
> >+ set PRB_FINAL_MASK when it is not the last descriptor
> >
> > It should work because prb_reserve() finalizes the previous
> > descriptor after the new one is reserved. As a result:
> >
> >+ prb_reserve() should either see PRB_COMMIT_MASK in the previous
> >  descriptor and be able to finalize it.
> >
> >+ or prb_commit() will see that the head moved and it is not
> >  longer the last reserved one.
> 
> I do not like the idea of relying on descriptors to finalize
> themselves. I worry that there might be some hole there. Failing to
> finalize basically disables printk, so that is pretty serious.
> 
> Below is a patch against this series that adds support for finalizing
> all 4 queried states. It passes all my tests. Note that the code handles
> 2 corner cases:
> 
> 1. When seq is 0, there is no previous descriptor to finalize. This
>exception is important because we don't want to finalize the -1
>placeholder. Otherwise, upon the first wrap, a descriptor will be
>prematurely finalized.
> 
> 2. When a previous descriptor is being reserved for the first time, it
>might have a state_var value of 0 because the writer is still in
>prb_reserve() and has not set the initial value yet. I added
>considerable comments on this special case.
> 
> I am comfortable with adding this new code, although it clearly adds
> complexity.
> 
> John Ogness
> 
> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index 90d48973ac9e..1ed1e9eb930f 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -860,9 +860,11 @@ static bool desc_reserve(struct printk_ringbuffer *rb, 
> unsigned long *id_out)
>   struct prb_desc_ring *desc_ring = >desc_ring;
>   unsigned long prev_state_val;
>   unsigned long id_prev_wrap;
> + unsigned long state_val;
>   struct prb_desc *desc;
>   unsigned long head_id;
>   unsigned long id;
> + bool is_final;
>  
>   head_id = atomic_long_read(_ring->head_id); /* LMM(desc_reserve:A) 
> */
>  
> @@ -953,10 +955,17 @@ static bool desc_reserve(struct printk_ringbuffer *rb, 
> unsigned long *id_out)
>* See "ABA Issues" about why this verification is performed.
>*/
>   prev_state_val = atomic_long_read(>state_var); /* 
> LMM(desc_reserve:E) */
> - if (prev_state_val &&
> - get_desc_state(id_prev_wrap, prev_state_val, NULL) != 
> desc_reusable) {
> - WARN_ON_ONCE(1);
> - return false;
> + if (get_desc_state(id_prev_wrap, prev_state_val, _final) != 
> desc_reusable) {
> + /*
> +  * If this descriptor has never been used, @prev_state_val
> +  * will be 0. However, even though it may have never been
> +  * used, it may have been finalized. So that flag must be
> +  * ignored.
> +  */
> + if ((prev_state_val & ~DESC_FINAL_MASK)) {
> + WARN_ON_ONCE(1);
> + return false;
> + }
>   }
>  
>   /*
> @@ -967,10 +976,25 @@ static bool desc_reserve(struct printk_ringbuffer *rb, 
> unsigned long *id_out)
>* any other changes. A write memory barrier is sufficient for this.
>* This pairs with desc_read:D.
>*/
> - if (!atomic_long_try_cmpxchg(>state_var, _state_val,
> -  id | 0)) { /* LMM(desc_reserve:F) */
> - WARN_ON_ONCE(1);
> - return false;
> + if (is_final)
> + state_val = id | 0 | DESC_FINAL_MASK;

The state from the previous wrap always have to have DESC_FINAL_MASK set.
Do I miss something, please?

> + else
> + state_val = id | 0;
> + if (atomic_long_cmpxchg(>state_var, prev_state_val,
> + state_val) != prev_state_val) { /* 
> LMM(desc_

Re: [PATCH v2 5/7][next] printk: ringbuffer: add finalization/extension support

2020-08-27 Thread Petr Mladek
On Thu 2020-08-27 12:04:58, John Ogness wrote:
> On 2020-08-26, Petr Mladek  wrote:
> >> This series makes a very naive assumption that the previous
> >> descriptor is either in the reserved or committed queried states. The
> >> fact is, it can be in any of the 4 queried states. Adding support for
> >> finalization of all the states then gets quite complex, since any
> >> state transition (cmpxchg) may have to deal with an unexpected FINAL
> >> flag.
> >
> > It has to be done in two steps to avoid race:
> >
> > prb_commit()
> >
> >+ set PRB_COMMIT_MASK
> >+ check if it is still the last descriptor in the array
> >+ set PRB_FINAL_MASK when it is not the last descriptor
> >
> > It should work because prb_reserve() finalizes the previous
> > descriptor after the new one is reserved. As a result:
> >
> >+ prb_reserve() should either see PRB_COMMIT_MASK in the previous
> >  descriptor and be able to finalize it.
> >
> >+ or prb_commit() will see that the head moved and it is not
> >  longer the last reserved one.
> 
> I do not like the idea of relying on descriptors to finalize
> themselves. I worry that there might be some hole there. Failing to
> finalize basically disables printk, so that is pretty serious.
> 
> Below is a patch against this series that adds support for finalizing
> all 4 queried states. It passes all my tests. Note that the code handles
> 2 corner cases:
> 
> 1. When seq is 0, there is no previous descriptor to finalize. This
>exception is important because we don't want to finalize the -1
>placeholder. Otherwise, upon the first wrap, a descriptor will be
>prematurely finalized.
> 
> 2. When a previous descriptor is being reserved for the first time, it
>might have a state_var value of 0 because the writer is still in
>prb_reserve() and has not set the initial value yet. I added
>considerable comments on this special case.
> 
> I am comfortable with adding this new code, although it clearly adds
> complexity.
> 
> John Ogness
> 
> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index 90d48973ac9e..1ed1e9eb930f 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -967,10 +976,25 @@ static bool desc_reserve(struct printk_ringbuffer *rb, 
> unsigned long *id_out)
>* any other changes. A write memory barrier is sufficient for this.
>* This pairs with desc_read:D.
>*/
> - if (!atomic_long_try_cmpxchg(>state_var, _state_val,
> -  id | 0)) { /* LMM(desc_reserve:F) */
> - WARN_ON_ONCE(1);
> - return false;
> + if (is_final)
> + state_val = id | 0 | DESC_FINAL_MASK;
> + else
> + state_val = id | 0;
> + if (atomic_long_cmpxchg(>state_var, prev_state_val,
> + state_val) != prev_state_val) { /* 
> LMM(desc_reserve:F) */
> + /*
> +  * This reusable descriptor must have been finalized already.
> +  * Retry with a reusable+final expected value.
> +  */
> + prev_state_val |= DESC_FINAL_MASK;
> + state_val |= DESC_FINAL_MASK;
> +
> + if (!atomic_long_try_cmpxchg(>state_var, _state_val,
> +  state_val)) { /* 
> LMM(desc_reserve:FIXME) */
> +
> + WARN_ON_ONCE(1);
> + return false;

IMHO, this need to be done in a cycle until is succeeds. The previous
descriptor might get reopened or commited several times in the mean time.

I have played with the code from the original patchset in parallel
and came with alreanative solution that uses DESC_FINAL_MASK
as I originally thought. See below.

I still have to shake my head around both approaches.

Anwyay, here is my code against the original patchset:

>From 578a9ad6bdb536269cc03d7a65420a414ac35022 Mon Sep 17 00:00:00 2001
From: Petr Mladek 
Date: Thu, 27 Aug 2020 16:38:03 +0200
Subject: [PATCH] printk: Alternative solution for handling non-finalized
 descriptors

The original patchset allowed to set PRB_FINAL_MASK even when PRB_COMMIT_MASK
was not set. It was used in prb_reserve() to make sure that the previous
record will always get finalized.

The result is that there are more possible combinations of the bits
and more cmpxchg calls have to be used with different combinations
of bits.

This is easy in prb_commit(). It needs to set PRB_COMMIT_MASK.
PRB_FINAL_MASK could never get removed. So it is enough to try
both possibilities in the right order.

But it gets more problematic 

Re: [PATCH v2 5/7][next] printk: ringbuffer: add finalization/extension support

2020-08-27 Thread Petr Mladek
Hi,

this mail is based on my review that I did last two days. I haven't
seen the last code that tried to handle the finalize issues.
Anyway, this feedback might give some clueue as well.

IMPORTANT: It seems that we both had different understanding of
the DESC_FINAL_MASK behavior. It might explain some problems
and confusion. See my last comment in this mail for more.

On Mon 2020-08-24 12:41:36, John Ogness wrote:
> Add support for extending the last data block. For this, introduce a new
> finalization state flag that identifies if a descriptor may be extended.
> 
> When a writer calls the commit function prb_commit(), the record may still
> continue to be in the reserved queried state. In order for that record to
> enter into the committed queried state, that record also must be finalized.
> Finalization can occur anytime while the record is in the reserved queried
> state, even before the writer has called prb_commit().

The above paragraph is a bit confusing. The reserved state means that
that the record is being modified. prb_commit() sets DESC_COMMIT_MASK.
It means that the record is in a consistent state. It might get
reopened or finalized.

> Until a record is finalized (represented by "DESC_FINAL_MASK"), a writer
> may "reopen" that record and extend it with more data.

I would replace the above two paragrahs with something like:

"The original DESC_COMMIT_MASK is still set when the record is in
a consistent state. In this state, it can either get reopened or
finalized.

The new DESC_FINAL_MASK is set when the record can not longer get
modified. It might be set explicitely by the new prb_commit_finalize()
when the writer does not longer plan to append data. But it is also
set automatically when a new descriptor is allocated.

The automatic finalization simplifies the implementation. Only the
last datablock could get extended easily."


> Note that existing descriptors are automatically finalized whenever new
> descriptors are created. A record can never be "unfinalized".
> 
> Two new memory barrier pairs are introduced, but these are really just
> alternate path barriers that exactly correspond to existing memory
> barriers.

[...]

> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index d66718e74aae..90d48973ac9e 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
> @@ -79,6 +81,20 @@
>   * committed or reusable queried state. This makes it possible that a valid
>   * sequence number of the tail is always available.
>   *
> + * Descriptor Finalization
> + * ~~~
> + * When a writer calls the commit function prb_commit(), the record may still
> + * continue to be in the reserved queried state. In order for that record to
> + * enter into the committed queried state, that record also must be 
> finalized.
> + * Finalization can occur anytime while the record is in the reserved queried
> + * state, even before the writer has called prb_commit().

This has the same problem as the commit message ;-)

> + *
> + * Until a record is finalized (represented by "DESC_FINAL_MASK"), a writer
> + * may "reopen" that record and extend it with more data.
> + *
> + * Note that existing descriptors are automatically finalized whenever new
> + * descriptors are created. A record can never be "unfinalized".
> + *
>   * Data Rings
>   * ~~
>   * The two data rings (text and dictionary) function identically. They exist

[...]

> @@ -1018,6 +1071,77 @@ static char *data_alloc(struct printk_ringbuffer *rb,
>   return >data[0];
>  }
>  
> +/*
> + * Extend the allocation of a specified data block, invalidating the oldest
> + * data block(s) if necessary. This function also associates the data block
> + * with a specified descriptor.
> + */

data_realloc() actually allows to even reduce the buffer size. It is
used when the original buffer was larger than the original
info->text_len + requested text_buf_size.

I though a lot about it. It is a questionable behavior.

First, we did not expected this when designed the ring buffer. Well, I
do not see why it should not work. More free space in the ring buffer
does not hurt. And the cmpxchg in data_alloc() compares the exact
lpos.

Second, there is the idea to flush the existing pieces of continuous
lines to the consoles even before the record is finalized. For this,
we will need to make sure that the already committed text does not longer
change. The main responsibility has the caller (printk code). But
the ring buffer code will need to allow reading the already committed
text even when the buffer is being extended. But even this should work
when shrinking the ring buffer. It will be in the same wrap and
the reader would read only the part defined by the original info->text_len.


Anyway, the comment is too short. It does not describe what exactly
is done and the limitations. I suggest something like:

/*
 * Try to resize an existing data block defined by the given 

Re: [PATCH v2 5/7][next] printk: ringbuffer: add finalization/extension support

2020-08-26 Thread Petr Mladek
On Wed 2020-08-26 14:43:22, John Ogness wrote:
> On 2020-08-26, Sergey Senozhatsky  wrote:
> >>> @@ -1157,6 +1431,14 @@ bool prb_reserve(struct prb_reserved_entry *e, 
> >>> struct printk_ringbuffer *rb,
> >>>   goto fail;
> >>>   }
> >>>  
> >>> + /*
> >>> +  * New data is about to be reserved. Once that happens, previous
> >>> +  * descriptors are no longer able to be extended. Finalize the
> >>> +  * previous descriptor now so that it can be made available to
> >>> +  * readers (when committed).
> >>> +  */
> >>> + desc_finalize(desc_ring, DESC_ID(id - 1));
> >>> +
> >>>   d = to_desc(desc_ring, id);
> >>>  
> >>>   /*
> >> 
> >> Apparently this is not enough to guarantee that past descriptors are
> >> finalized. I am able to reproduce a scenario where the finalization
> >> of a certain descriptor never happens. That leaves the descriptor
> >> permanently in the reserved queried state, which prevents any new
> >> records from being created. I am investigating.
> >
> > Good to know. I also run into problems:
> > - broken dmesg (and broken journalctl -f /dev/kmsg poll) and broken
> >   syslog read
> >
> > $ strace dmesg
> >
> > ...
> > openat(AT_FDCWD, "/dev/kmsg", O_RDONLY|O_NONBLOCK) = 3
> > lseek(3, 0, SEEK_DATA)  = 0
> > read(3, 0x55dda8c240a8, 8191)   = -1 EAGAIN (Resource temporarily 
> > unavailable)
> > close(3)= 0
> > syslog(10 /* SYSLOG_ACTION_SIZE_BUFFER */) = 524288
> > mmap(NULL, 528384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) 
> > = 0x7f43ea847000
> > syslog(3 /* SYSLOG_ACTION_READ_ALL */, "", 524296) = 0
> 
> Yes, this a consequence of the problem. The tail is in the reserved
> queried state, so readers will not advance beyond it.
> 
> This series makes a very naive assumption that the previous descriptor
> is either in the reserved or committed queried states. The fact is, it
> can be in any of the 4 queried states. Adding support for finalization
> of all the states then gets quite complex, since any state transition
> (cmpxchg) may have to deal with an unexpected FINAL flag.
>
> The ringbuffer was designed so that descriptors are completely
> self-contained. So adding logic where an action on one descriptor should
> affect another descriptor is far more complex than I initially expected.
> 
> Keep in mind the finalization concept satisfies 3 things:
> 
> - denote if a record can be extended (i.e. transition back to reserved)
> - denote if a reader may read the record
> - denote if a writer may recycle a record
>
> I have not yet given up on the idea of finalization (particularly
> because it allows mainline LOG_CONT behavior to be preserved locklessy),

In each case, we need to finalize the descriptor also in prb_commit()
when the descriptor is not longer the last reserved one.

It has to be done in two steps to avoid race:

prb_commit()

   + set PRB_COMMIT_MASK
   + check if it is still the last descriptor in the array
   + set PRB_FINAL_MASK when it is not the last descriptor

It should work because prb_reserve() finalizes the previous
descriptor after the new one is reserved. As a result:

   + prb_reserve() should either see PRB_COMMIT_MASK in the previous
 descriptor and be able to finalize it.

   + or prb_commit() will see that the head moved and it is not
 longer the last reserved one.

I keep my fingers crossed.

Best Regards,
Petr

PS: I am still in the middle of review. And I had a feeling that
this situatuation was not handled.

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 4/7][next] printk: ringbuffer: add BLK_DATALESS() macro

2020-08-25 Thread Petr Mladek
On Mon 2020-08-24 12:41:35, John Ogness wrote:
> Rather than continually needing to explicitly check @begin and @next
> to identify a dataless block, introduce and use a BLK_DATALESS()
> macro.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 3/7][next] printk: ringbuffer: relocate get_data()

2020-08-25 Thread Petr Mladek
On Mon 2020-08-24 12:41:34, John Ogness wrote:
> Move the internal get_data() function as-is above prb_reserve() so
> that a later change can make use of the static function.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 1/7][next] printk: ringbuffer: rename DESC_COMMITTED_MASK flag

2020-08-25 Thread Petr Mladek
On Mon 2020-08-24 12:41:32, John Ogness wrote:
> The flag DESC_COMMITTED_MASK has a much longer name compared to the
> other state flags and also is in past tense form, rather than in
> command form. Rename the flag to DESC_COMMIT_MASK in order to match
> the other state flags.

I am not a native speaker. But the command form sounds a bit misleading to me.

I see that the new name will make more sense after adding the state "FINAL".
But "FINAL" will not be a command form either.

I am fine with the change but I would prefer a better commit message.
What about something like?

"A comming support for continuous lines will allow to reopen records
with DESC_COMMITTED_MASK set. As a result, the flag will not longer
describe the final committed state. Rename it to DESC_COMMIT_MASK
as a preparation step."


With the updated commit message:

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 2/7][next] printk: ringbuffer: change representation of reusable

2020-08-25 Thread Petr Mladek
On Mon 2020-08-24 12:41:33, John Ogness wrote:
> The reusable queried state is represented by the combined flags:
> 
> DESC_COMMIT_MASK | DESC_REUSE_MASK
> 
> There is no reason for the DESC_COMMIT_MASK to be part of that
> representation. In particular, this will add confusion when more
> state flags are available.
> 
> Change the representation of the reusable queried state to just
> the DESC_REUSE_MASK flag.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH][next] docs: vmcoreinfo: add lockless printk ringbuffer vmcoreinfo

2020-08-20 Thread Petr Mladek
On Fri 2020-08-14 23:39:16, John Ogness wrote:
> With the introduction of the lockless printk ringbuffer, the
> VMCOREINFO relating to the kernel log buffer was changed. Update the
> documentation to match those changes.
> 
> Fixes: ("printk: use the lockless ringbuffer")
> Signed-off-by: John Ogness 
> Reported-by: Nick Desaulniers 

Looks nice.

Reviewed-by: Petr Mladek 

Thanks a lot for taking care of the crashdump related stuff.
It helps a lot.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [patch 0/2] timekeeping: NMI safe timekeeper enhancements

2020-08-20 Thread Petr Mladek
On Thu 2020-08-20 12:30:55, Thomas Gleixner wrote:
> Petr,
> 
> On Thu, Aug 20 2020 at 10:47, Petr Mladek wrote:
> > The interface is perfectly fine for printk() needs.
> 
> Good. So I suggest that I apply that on top of rc1 somewhere in tip and
> tag the top commit. So you can pull that tag into your printk branch and
> go wild.

Sounds good to me.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [patch 0/2] timekeeping: NMI safe timekeeper enhancements

2020-08-20 Thread Petr Mladek
On Fri 2020-08-14 12:19:33, Thomas Gleixner wrote:
> printk intends to store various timestamps (MONOTONIC, REALTIME, BOOTTIME)
> to make correlation of dmesg accross different machines easier.
> 
> The NMI safe timekeeper allows to retrieve these timestamps from any
> context.

For both patches:

Tested-by: Petr Mladek 

I am not familiar with the timekeeping code so that I could not
provide a valuable review. Anyway, the patches seem to work
as expected.

The interface is perfectly fine for printk() needs.


I tested them with the patch below. The first timestamps appear
as early as before:

[0.00] [0.00][1970-01-01T00:00:00][T0] kvm-clock: Using 
msrs 4b564d01 and 4b564d00
[0.00] [0.00][1970-01-01T00:00:00][T0] kvm-clock: cpu 0, 
msr 129c01001, primary cpu clock
[0.00] [0.00][1970-01-01T00:00:00][T0] kvm-clock: using 
sched offset of 56519600356309 cycles
[0.08] [0.08][1970-01-01T00:00:00][T0] clocksource: 
kvm-clock: mask: 0x max_cycles: 0x1cd42e4dffb, max_idle_ns: 
881590591483 ns
[0.22] [0.22][1970-01-01T00:00:00][T0] tsc: Detected 
2112.000 MHz processor
[0.000835] [0.000835][1970-01-01T00:00:00][T0] e820: update [mem 
0x-0x0fff] usable ==> reserved

The realtime timestamps get ready later as expected:

[0.073075] [0.073075][1970-01-01T00:00:00][T0] rcu: Adjusting 
geometry for rcu_fanout_leaf=16, nr_cpu_ids=3
[0.075847] [0.075847][1970-01-01T00:00:00][T0] NR_IRQS: 524544, 
nr_irqs: 448, preallocated irqs: 16
[0.076014] [0.076014][2020-08-20T07:58:46][T0] Console: colour 
dummy device 80x25
[0.076014] [0.076014][2020-08-20T07:58:46][T0] printk: console 
[tty0] enabled

Also suspend resume seems to work as expected. I tested it the follwing way:

echo core > /sys/power/pm_test
echo reboot > /sys/power/disk
echo disk > /sys/power/state

and the result is:

[  224.422540] [  224.422540][2020-08-20T08:02:31][ T5124] Disabling non-boot 
CPUs ...
[  224.424171] [  224.424171][2020-08-20T08:02:31][   T15] IRQ fixup: irq 21 
move in progress, old vector 36
[  224.425304] [  224.425304][2020-08-20T08:02:31][ T5124] smpboot: CPU 1 is 
now offline
[  224.426664] [  224.426664][2020-08-20T08:02:31][   T20] IRQ 21: no longer 
affine to CPU2
[  224.426685] [  224.426685][2020-08-20T08:02:31][   T20] IRQ 24: no longer 
affine to CPU2
[  224.426717] [  224.426717][2020-08-20T08:02:31][   T20] IRQ 27: no longer 
affine to CPU2
[  224.427765] [  224.427765][2020-08-20T08:02:31][ T5124] smpboot: CPU 2 is 
now offline
[  224.428016] [  224.428016][2020-08-20T08:02:31][ T5124] PM: hibernation: 
debug: Waiting for 5 seconds.
[  224.428218] [  229.618141][2020-08-20T08:02:36][ T5124] Enabling non-boot 
CPUs ...
   ^^^^^^  ^^
[  224.428560] [  229.618483][2020-08-20T08:02:36][ T5124] x86: Booting SMP 
configuration:
[  224.428564] [  229.618486][2020-08-20T08:02:36][ T5124] smpboot: Booting 
Node 0 Processor 1 APIC 0x1
[  224.428766] [  229.618688][2020-08-20T08:02:36][T0] kvm-clock: cpu 1, 
msr 129c01041, secondary cpu clock
[  224.449192] [  229.639115][2020-08-20T08:02:36][   T14] kvm-guest: 
stealtime: cpu 1, msr 17fbf2080


And here is the patch that I used for testing:

>From 39bdfebfa94fc55616fe23f2f0b80e06479b65e0 Mon Sep 17 00:00:00 2001
From: Petr Mladek 
Date: Thu, 20 Aug 2020 10:18:57 +0200
Subject: [PATCH] printk: Test using all three timestamps: mono, boot, real

Test timekeeper enhancements proposed at
https://lore.kernel.org/r/20200814101933.574326...@linutronix.de

Allow to store and show all three timestamp (mono, boot, real).

It is not final solution. The prefix takes 83 characters that might
make the real text invisible or hard to read. Fragments of seconds
are lost in when printing real time timestamp. Offset is not provided
for crashdump tools.

Only syslog interface is supported. The log can be seen on consoles
or via:

    dmesg -S

Signed-off-by: Petr Mladek 
---
 kernel/printk/printk.c | 40 ++--
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9b75f6bfc333..329f3595b024 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -366,7 +366,7 @@ enum log_flags {
 };
 
 struct printk_log {
-   u64 ts_nsec;/* timestamp in nanoseconds */
+   struct ktime_timestamps ts; /* timestamps */
u16 len;/* length of entire record */
u16 text_len;   /* length of text buffer */
u16 dict_len;   /* length of dictionary buffer */
@@ -443,7 +443,7 @@ static u64 clear_seq;
 static u32 clear_idx;
 
 #ifdef CONFIG_PRINTK_CALLER
-#define PREFIX_MAX 48
+#define PREFIX_MAX 48+14+21
 #else
 #define PREFIX_MAX 32
 #endif
@@ -614,7 +614,7 

Re: [RFC PATCH] printk: Change timestamp to triplet as mono, boot and real

2020-08-14 Thread Petr Mladek
On Thu 2020-08-13 20:31:55, Sergey Senozhatsky wrote:
> On (20/08/13 12:22), Petr Mladek wrote:
> >
> >   + would take more space (prefix + text vs. binary representation)
> 
> Dict buffer is allocated regardless of how we use it, and only printks
> from drivers/* (dev_printk*) add dict payload. It might be the case
> that on some (if not most) systems dict pages are not used 90% of times
> (if not 99%).
> 
> >   + not reliable because dict is currently dropped when no space
> 
> Well, in the perfect world this is a problem, but "maybe not having
> alternative timestamps sometimes" can be OK approach for people who
> wants to use those triplet timestamps.

I am afraid the people would see missing timestamps as a bug.
They want it because they want to corelate kernel and userspace logs.
The timestamps from realtime clock are supposed to make it
straightforward.

More importantly. I do not see a reasonable way how to handle it.
No timestamp is worse than timestamps from non-synchronized clock.
Mixing timestamps from different clocks (as fallback) would cause
a lot of confusion (more harm than good).

> But, in general, how real this problem is? What I sae so far (on my boxes)
> was that printk messages are longer than dict payload.

Yeah, I think that dict are less used on normal systems. But it might
be very different when a driver gets more verbose during debugging
and the amount of dev_printk() calls is much higher.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-14 Thread Petr Mladek
On Fri 2020-08-14 10:22:35, John Ogness wrote:
> On 2020-08-14, Sergey Senozhatsky  wrote:
> > One thing that we need to handle here, I believe, is that the context
> > which crashes the kernel should flush its cont buffer, because the
> > information there is relevant to the crash:
> >
> > pr_cont_alloc_info();
> > pr_cont(, "1");
> > pr_cont(, "2");
> > >>
> >oops
> >   panic()
> > <<
> > pr_cont_flush();
> >
> > We better flush that context's pr_cont buffer during panic().
> 
> I am not convinced of the general usefulness of partial messages, but as
> long as we have an API that includes registration, usage, and
> deregistration of some sort of handle, then we leave the window open for
> such implementations.

Registering some handle is an interesting idea. But it rules out
buffers on the stack. And we should avoid dynamic allocation.
printk() must be reliable also when there is not enough memory.

Please, keep it simple and do not add dependencies on new subsystems.
Using external code in printk() is always a call for problems.
The called code must be lockless and must not call printk().

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-14 Thread Petr Mladek
On Thu 2020-08-13 20:54:35, Sergey Senozhatsky wrote:
> On (20/08/13 10:41), Petr Mladek wrote:
> > > My concerns about this idea:
> > > 
> > > - What if the printk user does not correctly terminate the cont message?
> > >   There is no mechanism to allow that open record to be force-finalized
> > >   so that readers can read newer records.
> > 
> > This is a real problem. And it is the reason why the cont buffer is
> > currently flushed (finalized) by the next message from another context.
> 
> I understand that you think that this should be discussed and addressed
> later in a separate patch, but, since we are on pr_cont topic right now,
> can we slow down and maybe re-think what is actually expected from
> pr_cont()? IOW, have the "what is expect from this feature" thread?
> 
> For instance, is missing \n the one and only reason why printk-s from
> another context flush cont buffer now? Because I can see some more reasons
> for current behaviour and I'd like to question those reasons.
> 
> I think what Linus said a long time ago was that the initial purpose of
> pr_cont was
> 
>   pr_info("Initialize feature foo...");
>   if (init_feature_foo() == 0)
>   pr_cont("ok\n");
>   else
>   pr_cont("not ok\n");
> 
>   And if init_feature_foo() crashes the kernel then the first printk()
>   form panic() will flush the cont buffer.
> 
> We can handle this by realizing that new printk() message has LOG_NEWLINE
> and has different log_level (not pr_cont), maybe.

Yes, this is a handy behavior. But it is also complicated
on the implementation side. It requires that consoles are able to
print the existing part of the continuous line and print only
the rest later.

BTW: It used to work before the commit 5c2992ee7fd8a29d041 ("printk:
 remove console flushing special cases for partial buffered
 lines");

BTW2: It will be much easier to implement when only the last
 message can be partially shown on the consoles. Each console
 driver will need to track the position only in one message.

 Also it will be easier when the part of the message if stored
 in the main lockless ring buffer. Then the driver could just
 try to reread the last message and see if it was concatenated.


> Let's look at the more general case:
> 
> CPU0  .. CPU255
> pr_info("text");
>   pr_alert("boom\n");
> pr_cont("1);
> pr_cont("2\n");
> 
> Do we really need to preliminary flush CPU0 pr_cont buffer in this
> case? There is no connection between messages from CPU0 and CPU255.
> Maybe (maybe!) what matters here is keeping the order of messages
> per-context rather than globally system-wide?

Honestly, I have no idea how many newlines are missing. They are often
not noticed because the buffered message is flushed later by some
other one.

The chance that some other "random" message will flush the pending
message is much lower if we have many cont buffers per-context
and per-cpu.

I am not brave enough to add more cont buffers without some fallback
mechanism to flush them later (irq_work?, timer?) or without
audit of all callers.

Where the audit is implicit when all callers are converted to
the buffered printk API.


There is one more problem. Any buffering might cause that nobody
will be able to see the message when things go wrong. Flushing
during panic() might help but only when panic() is called
and when there are system-wide cont buffers.

By other words, the current pr_cont() behavior causes mixed output
from time to time. But it increases the chance to see the messages.
And it makes it easier to find them in crashdump. Perfect output
is nice. But it will not help when the messages gets lost.

All I want to say that it is not black and white.


My opinion:

I will leave the decision on John. If he thinks that converting
all pr_cont() users to a buffered API is easier I will be fine
with it. It was proposed and requested several times.

If John realizes that my proposal to allow to reopen committed
messages is easier, I will be fine with it as well. We could
create the buffered API and convert the most critical
users one by one later. Also the context information will
allow to connect the broken messages in userspace and
do not complicate the kernel side.

Anyway, the lockless printk() feature is more important for
me that a perfect output of continuous lines.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-13 Thread Petr Mladek
On Thu 2020-08-13 12:35:47, John Ogness wrote:
> On 2020-08-13, Petr Mladek  wrote:
> > On Thu 2020-08-13 09:50:25, John Ogness wrote:
> >> On 2020-08-13, Sergey Senozhatsky  wrote:
> >> > This is not an unseen pattern, I'm afraid. And the problem here can
> >> > be more general:
> >> >
> >> >  pr_info("text");
> >> >  pr_cont("1");
> >> >  exception/IRQ/NMI
> >> >  pr_alert("text\n");
> >> >  pr_cont("2");
> >> >  pr_cont("\n");
> >> >
> >> > I guess the solution would be to store "last log_level" in task_struct
> >> > and get current (new) timestamp for broken cont line?
> >> 
> >> (Warning: new ideas ahead)
> >> 
> >> The fundamental problem is that there is no real association between
> >> the cont parts. So any interruption results in a broken record. If we
> >> really want to do this correctly, we need real association.
> 
> I believe I failed to recognize the fundamental problem. The fundamental
> problem is that the pr_cont() semantics are very poor. I now strongly
> believe that we need to fix those semantics by having the pr_cont() user
> take responsibility for buffering the message. Patching the ~2000
> pr_cont() users will be far easier than continuing to twist ourselves
> around this madness.

Yes, this would make the API much more reliable. But it is not completely
easy. It has been discussed several times. The most serious attempt
is 
https://lore.kernel.org/lkml/1541165517-3557-1-git-send-email-penguin-ker...@i-love.sakura.ne.jp/#t

lockdep.c is probably the most complicated pr_cont() user. The
conversion complicated the code a bit, see
https://lore.kernel.org/lkml/1541165517-3557-3-git-send-email-penguin-ker...@i-love.sakura.ne.jp/

Linus also wanted to have an API to flush part of the message
before some action and finalize it later. It would be usable
for things like:

pr_info("Doing something dangerous... ");
printk_flush();
do_dangerous_thing();
pr_cont("ok\n");

Where pr_cont() would allow to flush part of the message to the
consoles and add the end of the line later when it did not crash.

It would need some support in the lockless ring buffer API anyway.
But it is only nice to have from my POV.


> The alloc function would need to work for any context, but that would
> not be an issue. If the cont message started to get too large, pr_cont()
> could do its own flushing in between, while still holding on to the
> context information. If for some reason the alloc function could not
> allocate a buffer, all the pr_cont() calls could fallback to logging the
> individual cont parts.

Also this was discussed several times. The most acceptable solution
was to use "small" buffer on the stack (128 bytes or so).


> I believe this would solve all cont-related problems while also allowing
> the new ringbuffer to remain as it already is in linux-next.

You are right. The buffering is really good step. It would make
the API sane and reliable. It has been proposed and discussed many
times. The main obstacle was that there was no volunteer to get this
done. If you are able to do so, it would be amazing.

I am just not sure what is more complicated. Getting lockless
operation correctly is always tricky. But going though 2000
callers individually is also a lot of work.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] printk: Change timestamp to triplet as mono, boot and real

2020-08-13 Thread Petr Mladek
On Thu 2020-08-13 10:55:00, Sergey Senozhatsky wrote:
> On (20/08/11 15:02), Petr Mladek wrote:
> > On Tue 2020-08-11 14:05:12, Thomas Gleixner wrote:
> > > Petr Mladek  writes:
> > > > At least "crash" tool would need an update anyway. AFAIK, it checks
> > > > the size of struct printk_log and refuses to read it when it changes.
> > > >
> > > > It means that the hack with VMCOREINFO_FIELD_OFFSET probably is not
> > > > needed because we would need to update the crashdump-related tools 
> > > > anyway.
> > > >
> > > > Well, the timing is good. We are about to switch the printk ring
> > > > buffer into a lockless one. It requires updating the crashdump tools
> > > > as well. We could do this at the same time. The lockless ring buffer
> > > > already is in linux-next. It is aimed for 5.10 or 5.11.
> > > ...
> > > > It would be great to synchronize all these changes changes of the
> > > > printk log buffer structures.
> > > 
> > > I agree that having one update is a good thing, but pretty please can we
> > > finally make progress with this and not create yet another dependency?
> > 
> > To make it clear. I definitely do not want to block lockless printk by
> > this.
> > 
> > BTW: I am not 100% convinced that storing all three timestamps is
> > worth it. It increases the code complexity, metadata size. It needs
> > an interface with the userspace that has to stay backward compatible.
> 
> Can we, perhaps, store those various "alternative" timestamps in dict so
> then whoever wants to read them can just parse the dict key:value pairs
> attach to each printk message?

Interesting idea. It might be a way how to add optional metadata
without breaking compatibility with crashdump tools.

Well, I have bad feeling about it. Some of the reasons might be:

  + would take more space (prefix + text vs. binary representation)
  + not reliable because dict is currently dropped when no space
  + it would make the controversial dictionary feature more important


I would prefer to solve this by storing the timestamps in the
structure with metadata.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-13 Thread Petr Mladek
On Thu 2020-08-13 09:50:25, John Ogness wrote:
> On 2020-08-13, Sergey Senozhatsky  wrote:
> > This is not an unseen pattern, I'm afraid. And the problem here can
> > be more general:
> >
> > pr_info("text");
> > pr_cont("1");
> > exception/IRQ/NMI
> > pr_alert("text\n");
> > pr_cont("2");
> > pr_cont("\n");
> >
> > I guess the solution would be to store "last log_level" in task_struct
> > and get current (new) timestamp for broken cont line?
> 
> (Warning: new ideas ahead)
> 
> The fundamental problem is that there is no real association between
> the cont parts. So any interruption results in a broken record. If we
> really want to do this correctly, we need real association.
> 
> With the new finalize flag for records, I thought about perhaps adding
> support for chaining data blocks.
> 
> A data block currently stores an unsigned long for the ID of the
> associated descriptor. But it could optionally include a second unsigned
> long, which is the lpos of the next text part. All the data blocks of a
> chain would point back to the same descriptor. The descriptor would only
> point to the first data block of the chain and include a flag that it is
> using chained data blocks.
>
> Then we would only need to track the sequence number of the open record
> and new data blocks could be added to the data block chain of the
> correct record. Readers cannot see the record until it is finalized.

Note that we still must try to append the continuous piece into the
same data block. We could not concatenate them in the reader API
because it would create the problem with unused/skipped sequence numbers.

Also it would complicate the reader code. It would need to know
whether a record has already been printed together with a previous
one.


> Also, since only finalized records can be invalidated, there are no
> races of chains becoming invalidated while being appended.
> 
> My concerns about this idea:
> 
> - What if the printk user does not correctly terminate the cont message?
>   There is no mechanism to allow that open record to be force-finalized
>   so that readers can read newer records.

This is a real problem. And it is the reason why the cont buffer is
currently flushed (finalized) by the next message from another context.


> - For tasks, the sequence number of the open record could be stored on
>   the task_struct. For non-tasks, we could use a global per-cpu variable
>   where each CPU stores 2 sequence numbers: the sequence number of the
>   open record for the non-task and the sequence number of the open
>   record for an interrupting NMI. Is that sufficient?

Yeah, it would be possible.

Anyway, this would fix an already existing problem. It might get
complicated and tricky. I am afraid that it comes from the "perfection
is the enemy of good" department.

A good compromise might be to just store "loglevel" from the previous
message in the given context. It could then be used for pr_cont()
messages in the same context.

Anyway, I would solve this later because it is already existing
problem.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-13 Thread Petr Mladek
On Thu 2020-08-13 14:18:53, Sergey Senozhatsky wrote:
> On (20/08/13 02:30), John Ogness wrote:
> > 2. I haven't yet figured out how to preserve calling context when a
> > newline appears. For example:
> > 
> > pr_info("text");
> > pr_cont(" 1");
> > pr_cont(" 2\n");
> > pr_cont("3");
> > pr_cont(" 4\n");
> >
> > For "3" the calling context (info, timestamp) is lost because with "2"
> > the record is finalized. Perhaps the above is invalid usage of LOG_CONT?

If I get it correctly, the original code has the same problem.

The cont buffer is flushed when the cont piece ends with newline:

static bool cont_add(u32 caller_id, int facility, int level,
 enum log_flags flags, const char *text, size_t len)
{
   [...]

// The original flags come from the first line,
// but later continuations can add a newline.
if (flags & LOG_NEWLINE) {
cont.flags |= LOG_NEWLINE;
cont_flush();
}

return true;
}

cont_flush sets cont.len = 0;

static void cont_flush(void)
{
[...]
cont.len = 0;
}


The messages is appended only when cont.len != 0 in log_output:

static size_t log_output(int facility, int level, enum log_flags lflags, const 
char *dict, size_t dictlen, char *text, size_t text_len)
{
const u32 caller_id = printk_caller_id();

/*
 * If an earlier line was buffered, and we're a continuation
 * write from the same context, try to add it to the buffer.
 */
if (cont.len) {
if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
if (cont_add(caller_id, facility, level, lflags, text, 
text_len))
return text_len;
}

  [...]
}


Also the original context is overridden when the cont buffer is empty:

static bool cont_add(u32 caller_id, int facility, int level,
 enum log_flags flags, const char *text, size_t len)
{
[...]

if (!cont.len) {
cont.facility = facility;
cont.level = level;
cont.caller_id = caller_id;
cont.ts_nsec = local_clock();
cont.flags = flags;
}

[...]
}

So I would ignore this problem for now.


> This is not an unseen pattern, I'm afraid. And the problem here can
> be more general:
> 
>   pr_info("text");
>   pr_cont("1");
>   exception/IRQ/NMI
>   pr_alert("text\n");
>   pr_cont("2");
>   pr_cont("\n");
> 

Good point.


> I guess the solution would be to store "last log_level" in task_struct
> and get current (new) timestamp for broken cont line?

I think about storing the context in per-CPU and per-context array.
It should be more memory efficient than task_struct and it should
solve even the above problem.


> We have this problem now. E.g. early boot messages from one of my boxes:
> 
> 6,173,41837,-;x86: Booting SMP configuration:
> 6,174,41838,-; node  #0, CPUs:  #1 #2 #3 #4
> 4,175,44993,-;MDS CPU bug present and SMT on, data leak possible. See 
> https://www.kernel.org/doc/...
> 4,176,44993,c; #5 #6 #7
> 
> "CPUs: #1 #2 #3 #4 #5 #6 #7" is supposed to be one cont line with
> loglevel 6. But it gets interrupted and flushed so that "#5 #6 #7"
> appears with the different loglevel.

Nice example. It would be nice to fix this. But it should be done
separately.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-13 Thread Petr Mladek
On Thu 2020-08-13 02:30:02, John Ogness wrote:
> On 2020-08-12, Petr Mladek  wrote:
> > So, I have one crazy idea to add one more state bit so that we
> > could have:
> >
> >   + committed: set when the data are written into the data ring.
> >   + final: set when the data block could not longer get reopened
> >   + reuse: set when the desctiptor/data block could get reused
> >
> > "final" bit will define when the descriptor could not longer
> > get reopened (cleared committed bit) and the data block could
> > not get extended.
> 
> I implemented this solution for myself and am currently running more
> tests. Some things that I changed from your suggestion:
> 
> 1. I created a separate prb_reserve_cont() function. The reason for this
> is because the caller needs to understand what is happening. The caller
> is getting an existing record with existing data and must append new
> data. The @text_len field of the info reports how long the existing data
> is.

I see.

> So the LOG_CONT handling code in printk.c looks something like this:

Yeah, it makes sense.


> 2. I haven't yet figured out how to preserve calling context when a
> newline appears. For example:
> 
> pr_info("text");
> pr_cont(" 1");
> pr_cont(" 2\n");
> pr_cont("3");
> pr_cont(" 4\n");
> 
> For "3" the calling context (info, timestamp) is lost because with "2"
> the record is finalized. Perhaps the above is invalid usage of LOG_CONT?

I am going to answer Sergey's reply.


> 3. There are some memory barriers introduced, but it looks like it
> shouldn't add too much complexity.

Uff. I have hooped for this.

> I will continue to refine my working version and post a patch so that we
> have something to work with. This looks to be the most promising way
> forward. Thanks.

Uff, I am happy that it seems working. Several other approaches looked
much more complicated or they caused regressions.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-12 Thread Petr Mladek
On Tue 2020-08-11 18:05:51, Petr Mladek wrote:
> On Sat 2020-07-18 16:48:55, John Ogness wrote:
> > On 2020-07-17, Linus Torvalds  wrote:
> > > Make sure you test the case of "fast concurrent readers". The last
> > > time we did things like this, it was a disaster, because a concurrent
> > > reader would see and return the _incomplete_ line, and the next entry
> > > was still being generated on another CPU.
> > >
> > > The reader would then decide to return that incomplete line, because
> > > it had something.
> > >
> > > And while in theory this could then be handled properly in user space,
> > > in practice it wasn't. So you'd see a lot of logging tools that would
> > > then report all those continuations as separate log events.
> > >
> > > Which is the whole point of LOG_CONT - for that *not* to happen.
> > 
> > I expect this is handled correctly since the reader is not given any
> > parts until a full line is ready, but I will put more focus on testing
> > this to make sure. Thanks for the regression and testing tips.
> 
> Hmm, the current patchset has different problem. The continuation
> pieces are correctly passed as a single lines. But empty line is
> printed for each unused sequence number to avoid warnings about
> missed messages in journactl. It looks like:
> 
> I am afraid that the only working solution is to store all pieces
> in a single lockless transaction. I think that John already
> proposed using 2nd small lockless buffer for this. The problem
> might be how to synchronize flushing the pieces into the final
> buffer.

Do not panic! It might look scary. But I am less scared
after I wrote some pieces of the pseudo code.

So, I have one crazy idea to add one more state bit so that we
could have:

  + committed: set when the data are written into the data ring.
  + final: set when the data block could not longer get reopened
  + reuse: set when the desctiptor/data block could get reused


"final" bit will define when the descriptor could not longer
get reopened (cleared committed bit) and the data block could
not get extended.

The logic would be the following:

bool prb_reserve() {

desc = try_reopen_desc(seq);
if (desc) {
text_buf = data_alloc_continuous();
if (text_buf)
goto success;
else
/* commit the reopened desc back again */
prb_commit(desc);
}

/* Otherwise, do as before */
desc = desc_reserve();
if (!desc)
goto fail;

text_buf = data_alloc();
...

where:

static struct prb_desc *try_reopen_desc(seq)
{
struct prb_desc *desc;

enum desc_state d_state;
struct prb_desc desc;

d_state = desc_read(desc_ring, seq, );
if (d_state != committed_and_not_finalized)
return NULL;

if (!is_same_context(desc))
return NULL;

/* try to reopen only when the state is still the same */
if(!atomic_long_cmpxchg_relaxed(state_var,
val_committed_and_not_finished,
val_reserved))
return NULL;

return desc;
}

static char *data_alloc_continuous()
{
/*
 * Same as data_alloc() with one added parameter:
 * unsigned long requested_begin_lpos;
 */

begin_lpos = atomic_long_read(_ring->head_lpos);

do {
if (begin_lpos != requested_begin_lpos)
return NULL;

... same as before

} while (!atomic_long_try_cmpxchg(_ring->head_lpos, _lpos,
  next_lpos)); /* LMM(data_alloc:A) */

if (requested_begin_lpos) {
/* only update tail lpos */
blk_lpos->next = next_lpos;
/* return pointer to the new data space */
return >data[0];
}

/* For completely new block do everything as before */
blk = to_block(data_ring, begin_lpos);
blk->id = id; /* LMM(data_alloc:B) */
...
}

void prb_commit_and_finalize()
{
/* Same as prb_commit() + it will set also 'final' bit */
}


Addintional changes in the code:

+ desc_resrved() will also set 'final' bit in the previous
  descriptor so that the descriptor could not longer get reopended
  once committed.

+ prb_commit_and_finalize() will be called instead of prb_commit()
  when the message ends with '\n'.

+ prb_read() will allow to read the data only when
  the state is "committed_and_finalized".

+ desc_make_reusable() can be called only when the desciptor
  is in "commited_and_finalized" state.


I am not su

Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-11 Thread Petr Mladek
On Sat 2020-07-18 16:48:55, John Ogness wrote:
> On 2020-07-17, Linus Torvalds  wrote:
> > Make sure you test the case of "fast concurrent readers". The last
> > time we did things like this, it was a disaster, because a concurrent
> > reader would see and return the _incomplete_ line, and the next entry
> > was still being generated on another CPU.
> >
> > The reader would then decide to return that incomplete line, because
> > it had something.
> >
> > And while in theory this could then be handled properly in user space,
> > in practice it wasn't. So you'd see a lot of logging tools that would
> > then report all those continuations as separate log events.
> >
> > Which is the whole point of LOG_CONT - for that *not* to happen.
> 
> I expect this is handled correctly since the reader is not given any
> parts until a full line is ready, but I will put more focus on testing
> this to make sure. Thanks for the regression and testing tips.

Hmm, the current patchset has different problem. The continuation
pieces are correctly passed as a single lines. But empty line is
printed for each unused sequence number to avoid warnings about
missed messages in journactl. It looks like:

Aug 11 17:08:16 sle15-sp1 kernel: x86/fpu: xstate_offset[3]:  832, 
xstate_sizes[3]:   64
Aug 11 17:08:16 sle15-sp1 kernel: x86/fpu: xstate_offset[4]:  896, 
xstate_sizes[4]:   64
Aug 11 17:08:16 sle15-sp1 kernel: x86/fpu: Enabled xstate features 0x1f, 
context size is 960 bytes, using 'compacted' format.
Aug 11 17:08:16 sle15-sp1 kernel: BIOS-provided physical RAM map:
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: BIOS-e820: [mem 
0x-0x0009fbff] usable
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: BIOS-e820: [mem 
0x0009fc00-0x0009] reserved
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: BIOS-e820: [mem 
0x000f-0x000f] reserved
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
[...]
Aug 11 17:08:16 sle15-sp1 kernel: kvm-clock: using sched offset of 
85305506196438 cycles
Aug 11 17:08:16 sle15-sp1 kernel: clocksource: kvm-clock: mask: 
0x max_cycles: 0x1cd42e4dffb, max_idle_ns: 88>
Aug 11 17:08:16 sle15-sp1 kernel: tsc: Detected 2112.000 MHz processor
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: e820: update [mem 0x-0x0fff] 
usable ==> reserved
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: e820: remove [mem 0x000a-0x000f] 
usable
Aug 11 17:08:16 sle15-sp1 kernel: last_pfn = 0x18 max_arch_pfn = 0x4
Aug 11 17:08:16 sle15-sp1 kernel: MTRR default type: write-back
Aug 11 17:08:16 sle15-sp1 kernel: MTRR fixed ranges enabled:
[...]
Aug 11 17:08:16 sle15-sp1 kernel: clocksource: refined-jiffies: mask: 
0x max_cycles: 0x, max_idle_ns: 7645519>
Aug 11 17:08:16 sle15-sp1 kernel: setup_percpu: NR_CPUS:8192 nr_cpumask_bits:3 
nr_cpu_ids:3 nr_node_ids:1
Aug 11 17:08:16 sle15-sp1 kernel: percpu: Embedded 508 pages/cpu s2043904 r8192 
d28672 u2097152
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: pcpu-alloc: s2043904 r8192 d28672 u2097152 
alloc=1*2097152
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: 
Aug 11 17:08:16 sle15-sp1 kernel: pcpu-alloc: [0] 0 [0] 1 [0] 2 
Aug 11 17:08:16 sle15-sp1 kernel: KVM setup async PF for cpu 0
Aug 11 17:08:16 sle15-sp1 kernel: kvm-stealtime: cpu 0, msr 17f9f2080
Aug 11 17:08:16 sle15-sp1 kernel: Built 1 zonelists, mobility grouping on.  
Total pages: 1031901
Aug 11 17:08:16 sle15-sp1 kernel: Policy zone: Normal


> > So this is just a heads-up that I will not pull something that breaks
> > LOG_CONT because it thinks "user space can handle it". No. User space
> > does not handle it, and we need to handle it for the user.

I am afraid that this not acceptable for Linus either because people
will report bugs and complain about regression.

Slightly better solution is to skip the unused sequence numbers. It
looks good in both dmesg/journalctl by default. Note that journalctl
shows the warnings about missing lines "only" with "-a" option.
Sigh, but I am afraid that this is still not acceptable.


Hmm, I can't find any simple or even working solution for maintaining
a monotonic sequence number a lockless way that would be the same
for all stored pieces.

I am afraid that the only working solution is to store all pieces
in a single lockless transaction. I think that John already
proposed using 2nd small 

Re: [RFC PATCH] printk: Change timestamp to triplet as mono, boot and real

2020-08-11 Thread Petr Mladek
On Tue 2020-08-11 14:05:12, Thomas Gleixner wrote:
> Petr Mladek  writes:
> > At least "crash" tool would need an update anyway. AFAIK, it checks
> > the size of struct printk_log and refuses to read it when it changes.
> >
> > It means that the hack with VMCOREINFO_FIELD_OFFSET probably is not
> > needed because we would need to update the crashdump-related tools anyway.
> >
> > Well, the timing is good. We are about to switch the printk ring
> > buffer into a lockless one. It requires updating the crashdump tools
> > as well. We could do this at the same time. The lockless ring buffer
> > already is in linux-next. It is aimed for 5.10 or 5.11.
> ...
> > It would be great to synchronize all these changes changes of the
> > printk log buffer structures.
> 
> I agree that having one update is a good thing, but pretty please can we
> finally make progress with this and not create yet another dependency?

To make it clear. I definitely do not want to block lockless printk by
this.

BTW: I am not 100% convinced that storing all three timestamps is
worth it. It increases the code complexity, metadata size. It needs
an interface with the userspace that has to stay backward compatible.

Also it still will be racy because the timestamp is taken when the message
is printed. It might be "long" before or after the event that
it talks about.

There is still the alternative to print all three timestamps regularly
for those interested. It is less user convenient but much easier
to maintain.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] printk: Change timestamp to triplet as mono, boot and real

2020-08-11 Thread Petr Mladek
On Tue 2020-08-11 12:40:22, Orson Zhai wrote:
> From: Thomas Gleixner 
> 
> Timestamps printed in kernel log are retrieved by local_clock which reads
> jiffies as a referrence clock source.
> But it is diffcult to be synchronized with logs generated out of kernel,
> say some remote processors (Modem) in the Soc of mobile phones. 
> Jiffies will be unchanged when system is in suspend mode but Modem will
> not.
> 
> So timestamps are required to be compensated with suspend time in some
> complecated scenarios especially for Android system. As an initial
> implementation was rejected by Linus, Thomas made this patch [1] 2 yeas
> ago to replace ts_nsec with a struct ts that consists 3 types of time:
> mono, boot and real.
> 
> This is an updated version which comes from patch [1] written by Thomas
> and suggestion [2] about VMCORE_INFO given by Linus.
> 
> It provides the prerequisite code to print kernel logs with boot or real
> timestamp in the future.
>
> This patch has been tested in qemu-x86-system. One problem is the timestamp
> in kernel log will be printed [0.00] for longer time than before. 

This would be a regression. People put huge effort into having early boot
timestamps, see
https://lore.kernel.org/lkml/20180719205545.16512-1-pasha.tatas...@oracle.com/
Adding some active people from this patchset into CC.

I wonder if we could have these early timestamps also in the mono clock.

> 1 [0.00] microcode: microcode updated early to revision 0x28, date = 
> 2019-11-12 
> 2 [0.00] Linux version 5.8.0+ (root@spreadtrum) (gcc (Ubuntu 
> 7.5.0-3ubuntu1~18.04) 7.5.0, GNU ld (GNU Binutils for Ubuntu) 2.30) #2 SMP 
> Fri Aug 7 21:30:51 CST 2020
> 3 [0.00] Command line: BOOT_IMAGE=/vmlinuz-5.8.0+ 
> root=UUID=4d889bca-be18-433d-9b86-c1c1714cc293 ro quiet splash vt.handoff=1
> 4 [0.00] KERNEL supported cpus: 
> 5 [0.00]   Intel GenuineIntel
> 
> 223 [0.00]  Tracing variant of Tasks RCU enabled. 
> 224 [0.00] rcu: RCU calculated value of scheduler-enlistment delay is 
> 25 jiffies. 
> 225 [0.00] rcu: Adjusting geometry for rcu_fanout_leaf=16, 
> nr_cpu_ids=4 
> 226 [0.00] NR_IRQS: 524544, nr_irqs: 456, preallocated irqs: 16 
> 227 [0.059662] random: crng done (trusting CPU's manufacturer) 
>  
> 228 [0.059662] Console: colour dummy device 80x25 
> 229 [0.059662] printk: console [tty0] enabled 
> 230 [0.059662] ACPI: Core revision 20200528 
> 
> compared to old log:
> 
> 69 [0.00] reserve setup_data: [mem 
> 0x0001-0x00021fdf] usable
> 70 [0.00] efi: EFI v2.31 by American Megatrends
> 71 [0.00] efi: ESRT=0xdbf69818 ACPI=0xdaef8000 ACPI 2.0=0xdaef8000 
> SMBIOS=0xf0480
> 72 [0.00] SMBIOS 2.8 present.
> 73 [0.00] DMI: LENOVO ThinkCentre M4500t-N000/, BIOS FCKT58AUS 
> 09/17/2014
> 74 [0.00] tsc: Fast TSC calibration using PIT
> 75 [0.00] tsc: Detected 3292.477 MHz processor
> 76 [0.000503] e820: update [mem 0x-0x0fff] usable ==> reserved
> 
> 77 [0.000504] e820: remove [mem 0x000a-0x000f] usable
> 78 [0.000510] last_pfn = 0x21fe00 max_arch_pfn = 0x4
> 79 [0.000513] MTRR default type: uncachable

[...]

> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -366,7 +366,8 @@ enum log_flags {
>  };
>  
>  struct printk_log {
> - u64 ts_nsec;/* timestamp in nanoseconds */
> + /* Timestamps in nanoseconds */
> + struct timestamps ts;
>   u16 len;/* length of entire record */
>   u16 text_len;   /* length of text buffer */
>   u16 dict_len;   /* length of dictionary buffer */
> @@ -1090,7 +1091,9 @@ void log_buf_vmcoreinfo_setup(void)
>* parse it and detect any changes to structure down the line.
>*/
>   VMCOREINFO_STRUCT_SIZE(printk_log);
> - VMCOREINFO_OFFSET(printk_log, ts_nsec);
> + /* ts.mono used to be called "ts_nsec" */
> + VMCOREINFO_FIELD_OFFSET(printk_log, ts_nsec, ts.mono);
> + VMCOREINFO_OFFSET(printk_log, ts);
>   VMCOREINFO_OFFSET(printk_log, len);
>   VMCOREINFO_OFFSET(printk_log, text_len);
>   VMCOREINFO_OFFSET(printk_log, dict_len);

At least "crash" tool would need an update anyway. AFAIK, it checks
the size of struct printk_log and refuses to read it when it changes.

It means that the hack with VMCOREINFO_FIELD_OFFSET probably is not
needed because we would need to update the crashdump-related tools anyway.

Well, the timing is good. We are about to switch the printk ring
buffer into a lockless one. It requires updating the crashdump tools
as well. We could do this at the same time. The lockless ring buffer
already is in linux-next. It is aimed for 5.10 or 5.11. We are still
discussion handling of continuous lines that might need another change
in the internal structures, see

Re: [PATCH v2][next] printk: ringbuffer: support dataless records

2020-08-04 Thread Petr Mladek
On Tue 2020-07-21 15:31:28, John Ogness wrote:
> With commit ("printk: use the lockless ringbuffer"), printk()
> started silently dropping messages without text because such
> records are not supported by the new printk ringbuffer.
> 
> Add support for such records.
> 
> Currently dataless records are denoted by INVALID_LPOS in order
> to recognize failed prb_reserve() calls. Change the ringbuffer
> to instead use two different identifiers (FAILED_LPOS and
> NO_LPOS) to distinguish between failed prb_reserve() records and
> successful dataless records, respectively.
> 
> diff --git a/kernel/printk/printk_ringbuffer.c 
> b/kernel/printk/printk_ringbuffer.c
> index 7355ca99e852..0659b50872b5 100644
> --- a/kernel/printk/printk_ringbuffer.c
> +++ b/kernel/printk/printk_ringbuffer.c
>  static bool data_check_size(struct prb_data_ring *data_ring, unsigned int 
> size)
>  {
>   struct prb_data_block *db = NULL;
>  
> - /*
> -  * Writers are not allowed to write data-less records. Such records
> -  * are used only internally by the ringbuffer to denote records where
> -  * their data failed to allocate or have been lost.
> -  */
>   if (size == 0)
> - return false;
> + return true;

Nit: This might deserve a comment why size == 0 is handled
 a special way.specially. I think about something like:

/*
 * Empty data blocks are handled by special lpos values in
 * the record descriptor. No space is needed in the data ring.
 */

or simply

/* Data-less records take no space in the data ring. */

>   /*
>* Ensure the alignment padded size could possibly fit in the data

> @@ -1025,6 +1020,10 @@ static char *data_alloc(struct printk_ringbuffer *rb,
>  static unsigned int space_used(struct prb_data_ring *data_ring,
>  struct prb_data_blk_lpos *blk_lpos)
>  {
> + /* Data-less blocks take no space. */
> + if (LPOS_DATALESS(blk_lpos->begin))
> + return 0;

Nit: It seems that all the other locations check also blk_lpos->next,
 for example, get_data() does:

if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) {


 Both approaches are error prone. I would either simplify the
 other locations and check only lpos->begin. But better might
 be to be on the safe side do a paranoid check, like:

bool is_dataless(struct prb_data_blk_lpos *blk_lpos)
{
if (LPOS_DATALESS(blk_lpos->begin) || LPOS_DATALESS(blk_lpos->next)) {
WARN_ON_ONCE(blk_lpos->begin != blk_lpos->next);
return true;
}

return false;
}

> +
>   if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, 
> blk_lpos->next)) {
>   /* Data block does not wrap. */
>   return (DATA_INDEX(data_ring, blk_lpos->next) -

Anyway, the patch looks fine. It is already pushed in
printk/linux.git. So, if you agree with my nits, we should
solve them with separate patches on top of the existing ones.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4 0/4] printk: replace ringbuffer

2020-07-10 Thread Petr Mladek
On Fri 2020-07-10 11:58:34, John Ogness wrote:
> On 2020-07-10, Petr Mladek  wrote:
> >> The next series in the printk-rework (move LOG_CONT handling from
> >> writers to readers) makes some further changes that, while not
> >> incompatible, could affect the output of existing tools. It may be a
> >> good idea to let the new ringbuffer sit in linux-next until the next
> >> series has been discussed/reviewed/merged. After the next series,
> >> everything will be in place (with regard to userspace tools) to
> >> finish the rework.
> >
> > I know that it might be premature question. But I wonder what kind
> > of changes are expected because of the continuous lines.
> 
> I will be posting the next series quite soon, so I think it will be
> better to discuss it when we have a working example in front of us.

sure

> > Do you expect some changes in the ring buffer structures so that
> > the debugging tools would need yet another update to actually
> > access the data?
>
> The next series will be modifying the ringbuffer to allow data-less
> records. This is necessary to support the thousands of
> 
> pr_cont("\n");
> 
> calls in the kernel code. Failed dataring allocations will still be
> detected because the message flags for those records will be 0. For the
> above pr_cont() line, they will be LOG_NEWLINE|LOG_CONT.
> 
> Since the dump tools need to make changes for the new ringbuffer anyway,
> I think it would be good to hammer out the accepted LOG_CONT
> implementation first, just in case we do need to make any subtle
> internal changes.

Makes sense.


> > Or do you expect backward compatible changes that would allow
> > to pass related parts of the continuous lines via syslog/dev_kmsg
> > interface and join them later in userspace?
> 
> For users of console, non-extended netconsole, syslog, and kmsg_dump,
> there will be no external changes whatsoever. These interfaces have no
> awareness of sequence numbers, which will allow the kernel to
> re-assemble the LOG_CONT messages for them.

yup

> Users of /dev/kmsg and extended netconsole see sequence numbers. Offlist
> we discussed various hacks how to get around this without causing errors
> for existing software, but it was all ugly.
> 
> IMHO users of these sequence number interfaces need to see all the
> records individually and reassemble the LOG_CONT messages themselves if
> they want to. I believe that is the only sane path forward. To do this,
> the caller id will no longer be optional to the sequence number output
> since that is vital information to re-assemble the LOG_CONT messages.
> 
> Keep in mind that current software already needs to be able to handle
> the caller id being shown. Also, currently in mainline there is no
> guarantee that LOG_CONT messages are contiguous. So current software
> must also be ready to accept broken up LOG_CONT messages. This is why I
> think it would be acceptable to make this change for /dev/kmsg and
> extended netconsole. But I understand it is controversial since tools
> like systemd and dmesg use /dev/kmsg. Until they are modified to
> re-assemble LOG_CONT messages, they will present the user with the
> ugliness of LOG_CONT pieces (always, rather than as is now rarely).

I am not completely sure how often newline is used in common messages.
I hope that it is more frequently used in a debug code that has to
dump many details.

If my assumption is correct then your proposed approach should be well
acceptable. Normal people will not notice. Developers should be able
to deal with it, especially when a reasonable output might be seen
on consoles, via dmesg or crashdumnp.

Anyway, the proposal looks fine to me. I hope that it will work :-)
Let's continue the discussion when you send the patchset.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4 0/4] printk: replace ringbuffer

2020-07-10 Thread Petr Mladek
On Thu 2020-07-09 09:09:31, John Ogness wrote:
> On 2020-07-08, Petr Mladek  wrote:
> > OK, I think that we are ready to try this in linux-next.
> > I am going to push it there via printk/linux.git.
> >
> > [...]
> > 
> > Of course, there are still many potential problems. The following comes
> > to my mind:
> >
> > [...]
> >
> >+ Debugging tools accessing the buffer directly would need to
> >  understand the new structure. Fortunately John provided
> >  patches for the most prominent ones.
> 
> The next series in the printk-rework (move LOG_CONT handling from
> writers to readers) makes some further changes that, while not
> incompatible, could affect the output of existing tools. It may be a
> good idea to let the new ringbuffer sit in linux-next until the next
> series has been discussed/reviewed/merged. After the next series,
> everything will be in place (with regard to userspace tools) to finish
> the rework.

I know that it might be premature question. But I wonder what kind
of changes are expected because of the continuous lines.

Do you expect some changes in the ring buffer structures so that
the debugging tools would need yet another update to actually
access the data?

Or do you expect backward compatible changes that would allow
to pass related parts of the continuous lines via syslog/dev_kmsg
interface and join them later in userspace?

IMHO, it would make sense to wait only when the structures need
some modification. Concatenating related parts on the userspace
side will need to stay optional anyway.

As I say, this might be premature question. I just do not want
to unnecessary delay mainlining the current state. It would get
much wider testing there. And it is great when the changes might
be done in "small" steps.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v5 0/4] printk: replace ringbuffer

2020-07-10 Thread Petr Mladek
On Thu 2020-07-09 15:29:40, John Ogness wrote:
> Hello,
> 
> Here is a v5 for the first series to rework the printk
> subsystem. The v4 is here [0]. This first series
> only replaces the existing ringbuffer implementation. No locking
> is removed. The semantics/behavior of printk are kept the same
> except for a minor optimization that is reverted (patch 3).
> 
> John Ogness (4):
>   crash: add VMCOREINFO macro to define offset in a struct declared by
> typedef
>   printk: add lockless ringbuffer
>   Revert "printk: lock/unlock console only for new logbuf entries"
>   printk: use the lockless ringbuffer

The patchset is committed in printk/linux.git, branch printk-rework.

I did not add any target kernel version into the topic branch name.
We could use it for the entire rework. The pieces would go into
mainline when we and linux-next are happy.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [printk] 18a2dc6982: ltp.kmsg01.fail

2020-07-09 Thread Petr Mladek
On Thu 2020-07-09 22:07:58, Sergey Senozhatsky wrote:
> On (20/07/09 14:25), Petr Mladek wrote:
> > On Thu 2020-07-09 13:23:07, John Ogness wrote:
> > > On 2020-07-09, Petr Mladek  wrote:
> > > > I though more about it. IMHO, it will be better to modify
> > > > prb_first_seq() to do the same cycle as prb_next_seq()
> > > > and return seq number of the first valid entry.
> > > 
> > > Exactly!
> > > 
> > > Here is a patch that does just that. I added a prb_first_valid_seq()
> > > function and made prb_first_seq() static. (The ringbuffer still needs
> > > prb_first_seq() for itself.)
> > 
> > The fix looks fine to me:
> 
> Yeah, looks right to me as well.
> 
> > It means that we have two fixes on top of the original patchset. Could
> > you please send v5 with the two fixes integrated? I would just squash
> > them into the 4th patch.
> 
> I'd prefer v5, if possible.

Ah, my paragraph was confusing. I'd prefer v5 as well.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [printk] 18a2dc6982: ltp.kmsg01.fail

2020-07-09 Thread Petr Mladek
On Thu 2020-07-09 13:23:07, John Ogness wrote:
> On 2020-07-09, Petr Mladek  wrote:
> > I though more about it. IMHO, it will be better to modify
> > prb_first_seq() to do the same cycle as prb_next_seq()
> > and return seq number of the first valid entry.
> 
> Exactly!
> 
> Here is a patch that does just that. I added a prb_first_valid_seq()
> function and made prb_first_seq() static. (The ringbuffer still needs
> prb_first_seq() for itself.)

The fix looks fine to me:

Reviewed-by: Petr Mladek 

It means that we have two fixes on top of the original patchset. Could
you please send v5 with the two fixes integrated? I would just squash
them into the 4th patch.

Best Regards,
Petr


PS: I know that I am hurrying maybe too much at the moment.
Well, I will have vacation last two week in July. I think that this is
in good enough state for linux-next and I would like to have it there
at least few days before I leave.

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [printk] 18a2dc6982: ltp.kmsg01.fail

2020-07-09 Thread Petr Mladek
On Thu 2020-07-09 12:59:06, Petr Mladek wrote:
> On Thu 2020-07-09 12:20:35, John Ogness wrote:
> > On 2020-07-09, Sergey Senozhatsky  wrote:
> > > On (20/07/09 15:14), kernel test robot wrote:
> > > [..]
> > >
> > > Took me a while to find the FAIL-ed test:
> > >
> > >> kmsg01.c:393: INFO: TEST: read returns EPIPE when messages get 
> > >> overwritten
> > >> kmsg01.c:398: INFO: first seqno: 0
> > >> kmsg01.c:411: INFO: first seqno now: 881
> > >> kmsg01.c:425: FAIL: read returned: 77: SUCCESS (0)
> > >
> > > So this is seq number related
> > > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/logging/kmsg/kmsg01.c#L383
> > 
> > Excellent test.
> > 
> > Since the messages are above the expected average size, the dataring is
> > wrapping before the descriptor ring. This means that the initial
> > descriptors are still there, but their data is gone. Initially I would
> > generate an EPIPE for this, but it was changed. Here is the thread [0]
> > we had about this.
> 
> I see. IMHO, the following should do the job. The check is done only
> when the above prb_read_valid() succeeded. Therefore the printk_record
> has to include a valid value. And it must be the first valid record
> when some messages were lost.
> 
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index 62fc1abd9c4d..5d4760b5c671 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -775,9 +775,9 @@ static ssize_t devkmsg_read(struct file *file, char 
> __user *buf,
>   logbuf_lock_irq();
>   }
>  
> - if (user->seq < prb_first_seq(prb)) {
> + if (user->seq < r->info->seq) {
>   /* our last seen message is gone, return error and reset */
> - user->seq = prb_first_seq(prb);
> + user->seq = r->info->seq;
>   ret = -EPIPE;
>   logbuf_unlock_irq();
>   goto out;

I though more about it. IMHO, it will be better to modify
prb_first_seq() to do the same cycle as prb_next_seq()
and return seq number of the first valid entry.

IMHO, basically any caller in printk.c expects this behavior.
For example, devkmsg user would expect reading valid entry after doing
SEEK_SET. I would also expect to get valid record right after opening
devkmsg, etc.

The current prb_first_seq() is needed only _prb_read_valid(). For,
this I would rename the original function to prb_tail_seq().

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [printk] 18a2dc6982: ltp.kmsg01.fail

2020-07-09 Thread Petr Mladek
On Thu 2020-07-09 12:20:35, John Ogness wrote:
> On 2020-07-09, Sergey Senozhatsky  wrote:
> > On (20/07/09 15:14), kernel test robot wrote:
> > [..]
> >
> > Took me a while to find the FAIL-ed test:
> >
> >> kmsg01.c:393: INFO: TEST: read returns EPIPE when messages get overwritten
> >> kmsg01.c:398: INFO: first seqno: 0
> >> kmsg01.c:411: INFO: first seqno now: 881
> >> kmsg01.c:425: FAIL: read returned: 77: SUCCESS (0)
> >
> > So this is seq number related
> > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/logging/kmsg/kmsg01.c#L383
> 
> Excellent test.
> 
> Since the messages are above the expected average size, the dataring is
> wrapping before the descriptor ring. This means that the initial
> descriptors are still there, but their data is gone. Initially I would
> generate an EPIPE for this, but it was changed. Here is the thread [0]
> we had about this.

I see. IMHO, the following should do the job. The check is done only
when the above prb_read_valid() succeeded. Therefore the printk_record
has to include a valid value. And it must be the first valid record
when some messages were lost.

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 62fc1abd9c4d..5d4760b5c671 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -775,9 +775,9 @@ static ssize_t devkmsg_read(struct file *file, char __user 
*buf,
logbuf_lock_irq();
}
 
-   if (user->seq < prb_first_seq(prb)) {
+   if (user->seq < r->info->seq) {
/* our last seen message is gone, return error and reset */
-   user->seq = prb_first_seq(prb);
+   user->seq = r->info->seq;
ret = -EPIPE;
logbuf_unlock_irq();
goto out;


Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4 0/4] printk: replace ringbuffer

2020-07-08 Thread Petr Mladek
On Tue 2020-07-07 17:05:28, John Ogness wrote:
> Hello,
> 
> Here is a v4 for the first series to rework the printk
> subsystem. The v3 is here [0]. This first series
> only replaces the existing ringbuffer implementation. No locking
> is removed. The semantics/behavior of printk are kept the same
> except for a minor optimization that is reverted (patch 3).
> 
> Despite minor changes to the ringbuffer code since v3 (comments,
> function names, very minor refactoring), the ringbuffer logic
> itself has not changed. And, in particular, the memory barriers
> have been exactly preserved from v3. For this reason I deem it
> appropriate to keep Paul's reviewed by tag (patch 2).
> 
> RFC patches for various userspace tools to dump the kernel log
> are available: crash [1], makedumpfile [2], kexec-tools [3].
> 
> Finally, I would like to thank some people/organizations that
> helped with performing ringbuffer stress tests on big or rare
> hardware that I do not have access to:
> 
> - Prarit Bhargava of Red Hat (x86_64, ppc64le power8)
> - Michael Cree of Debian (alpha)
> - Jeff Scheel of OSU Open Source Lab (ppc64le power8 kvm)

OK, I think that we are ready to try this in linux-next.
I am going to push it there via printk/linux.git.

I have a good feeling about the patchset. The great thing is that
the access is still synchronized using logbuf_lock so that we do not
have to deal with races for the moment.

Of course, there are still many potential problems. The following comes
to my mind:

   + Bugs in the algorithm logic or implementation might prevent
 showing any messages on consoles or via syslog or /dev/kmsg.
 We did our best to avoid it.

   + Debugging tools accessing the buffer directly would need to
 understand the new structure. Fortunately John provided
 patches for the most prominent ones.

   + Small devices might complain about less effective use of memory.
 Part of descriptors and dictionaries ring buffers might stay
 unused. But it hopefully could get tuned.


This is basically just a start of the journey. I hope that it will be a
good one.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4 4/4] printk: use the lockless ringbuffer

2020-07-08 Thread Petr Mladek
On Tue 2020-07-07 17:05:32, John Ogness wrote:
> Replace the existing ringbuffer usage and implementation with
> lockless ringbuffer usage. Even though the new ringbuffer does not
> require locking, all existing locking is left in place. Therefore,
> this change is purely replacing the underlining ringbuffer.
> 
> Changes that exist due to the ringbuffer replacement:
> 
> - The VMCOREINFO has been updated for the new structures.
> 
> - Dictionary data is now stored in a separate data buffer from the
>   human-readable messages. The dictionary data buffer is set to the
>   same size as the message buffer. Therefore, the total required
>   memory for both dictionary and message data is
>   2 * (2 ^ CONFIG_LOG_BUF_SHIFT) for the initial static buffers and
>   2 * log_buf_len (the kernel parameter) for the dynamic buffers.
> 
> - Record meta-data is now stored in a separate array of descriptors.
>   This is an additional 72 * (2 ^ (CONFIG_LOG_BUF_SHIFT - 5)) bytes
>   for the static array and 72 * (log_buf_len >> 5) bytes for the
>   dynamic array.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4 3/4] Revert "printk: lock/unlock console only for new logbuf entries"

2020-07-08 Thread Petr Mladek
On Tue 2020-07-07 17:05:31, John Ogness wrote:
> This reverts commit 3ac37a93fa9217e576bebfd4ba3e80edaaeb2289.
> 
> This optimization will not apply once the transition to a lockless
> printk is complete. Rather than porting this optimization through
> the transition only to remove it anyway, just revert it now to
> simplify the transition.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v3 3/3] printk: use the lockless ringbuffer

2020-07-02 Thread Petr Mladek
On Thu 2020-07-02 17:43:22, lijiang wrote:
> 在 2020年07月02日 17:02, John Ogness 写道:
> > On 2020-07-02, lijiang  wrote:
> >> About the VMCOREINFO part, I made some tests based on the kernel patch
> >> v3, the makedumpfile and crash-utility can work as expected with your
> >> patch(userspace patch), but, unfortunately, the vmcore-dmesg(kexec-tools)
> >> can't correctly read the printk ring buffer information, and get the
> >> following error:
> >>
> >> "Missing the log_buf symbol"
> >>
> >> The kexec-tools(vmcore-dmesg) should also have a similar patch, just like
> >> in the makedumpfile and crash-utility.
> > 
> > Yes, a patch for this is needed (as well as for any other related
> > software floating around the internet).
> > 
> > I have no RFC patches for vmcore-dmesg. Looking at the code, I think it
> > would be quite straight forward to port the makedumpfile patch. I will
> 
> Yes, it should be a similar patch.
> 
> > try to make some time for this.
> > 
> That would be nice. Thank you, John Ogness.
> 
> > I do not want to patch any other software for this. I think with 3
> > examples (crash, makedumpfile, vmcore-dmesg), others should be able to
> 
> It's good enough to have the patch for the makedumpfile, crash and 
> vmcore-dmesg,
> which can ensure the kdump(userspace) work well.

I agree that this three are the most important ones and should be
enough.

Thanks a lot for working on it and testing it.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: buffer allocation: was: [PATCH v3 3/3] printk: use the lockless ringbuffer

2020-07-02 Thread Petr Mladek
On Mon 2020-06-29 23:57:59, John Ogness wrote:
> On 2020-06-29, Petr Mladek  wrote:
> >> @@ @@ void __init setup_log_buf(int early)
> >> +  prb_init(_rb_dynamic,
> >> +   new_log_buf, order_base_2(new_log_buf_len),
> >> +   new_dict_buf, order_base_2(new_log_buf_len),
> >> +   new_descs, order_base_2(new_descs_count));
> >
> > order_base_2() is safe. But the result might be tat some allocated
> > space is not used.
> >
> > I would prefer to make sure that new_log_buf_len is rounded, e.g.
> > by roundup_pow_of_two(), at the beginning of the function. Then we
> > could use ilog2() here.
> 
> new_log_buf_len can only be set within log_buf_len_update(), and it
> is already doing exactly what you want:
> 
> if (size)
> size = roundup_pow_of_two(size);
> if (size > log_buf_len)
> new_log_buf_len = (unsigned long)size;

I know. I though about repeating this in setup_log_bug() to make it
error proof. But it is not really needed. I do not resist on it.

> I can switch to ilog2() instead of the more conservative order_base_2().

Yes, please change it to ilog2().

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v3 2/3] printk: add lockless ringbuffer

2020-07-02 Thread Petr Mladek
On Thu 2020-06-18 16:55:18, John Ogness wrote:
> Introduce a multi-reader multi-writer lockless ringbuffer for storing
> the kernel log messages. Readers and writers may use their API from
> any context (including scheduler and NMI). This ringbuffer will make
> it possible to decouple printk() callers from any context, locking,
> or console constraints. It also makes it possible for readers to have
> full access to the ringbuffer contents at any time and context (for
> example from any panic situation).

Please, find few comments below. They are primary about improving
the comments. There are also 3 ideas how to clean up the code
a bit[*].

Everything might be solved by followup patches. Well, I would prefer
to update the comments in the next respin if you do it because
of the third patch.[**]

The small code changes can be done later. I do not want to introduce
eventual problems at this stage.

Feel free to use:

Reviewed-by: Petr Mladek 


[*] I have pretty good feeling about the code. And I really like the extensive
comments. The remaining ideas are mostly from the nits department.

[**] We are getting close to push this into linux-next. I think that
 it makes sense only when also the 3rd patch is ready.

 Also we should have a solution for the crashdump/crash tools at hands.
 But I think that this is already the case.


> --- /dev/null
> +++ b/kernel/printk/printk_ringbuffer.c
> + * Data Rings
> + * ~~
> + * The two data rings (text and dictionary) function identically. They exist
> + * separately so that their buffer sizes can be individually set and they do
> + * not affect one another.
> + *
> + * Data rings are byte arrays composed of data blocks. Data blocks are
> + * referenced by blk_lpos structs that point to the logical position of the
> + * beginning of a data block and the beginning of the next adjacent data
> + * block. Logical positions are mapped directly to index values of the byte
> + * array ringbuffer.
> + *
> + * Each data block consists of an ID followed by the raw data. The ID is the
> + * identifier of a descriptor that is associated with the data block. A data
> + * block is considered valid if all of the following conditions are met:
> + *
> + *   1) The descriptor associated with the data block is in the committed
> + *  or reusable queried state.

This sounds strange. The information in the descriptor is valid when
the descriptor is in the committed or reusable state. But the data
in the data block are valid only when the related descriptor is in
the committed state.

The data block might get reused faster than the descriptor that owned
it earlier.

> + *   2) The blk_lpos struct within the descriptor associated with the data
> + *  block references back to the same data block.

This is true. But this cross check is needed only when the assocoated
descriptor is found via the ID stored in the data block. It is not
needed when the data block is found via id/seq number stored in
the descriptor that is in commited state.


> + *   3) The data block is within the head/tail logical position range.

Also this is true. But it works by design. It does not need to be
checked when validating the data.

I would like to make this clear. So, I would write something like:


  The raw data stored in a data block can be considered valid when the
  the associated descriptor is in the committed state.

  The above condition is enough when the data block is found via
  information from the descriptor that was chosen by its ID.

  One more check is needed when the data block is found via
  blk_lpos.next stored in the descriptor for the previous data block.
  The data block might point to a descriptor in a committed state just
  by chance. It is the right one only when blk_lpos.begin points back
  to the data block.

I do not know. It might be to complicated. I actually did not mind
about the original text in previous versions. But I realize now that
it might have been the reason why the checks in data_make_reusable()
were too complicated in the previous versions.



> + * Sample reader code::
> + *
> + *   struct printk_info info;
> + *   struct printk_record r;
> + *   char text_buf[32];
> + *   char dict_buf[32];
> + *   u64 seq;
> + *
> + *   prb_rec_init_rd(, , _buf[0], sizeof(text_buf),
> + *   _buf[0], sizeof(dict_buf));
> + *
> + *   prb_for_each_record(0, _rb, , ) {
> + *   if (info.seq != seq)
> + *   pr_warn("lost %llu records\n", info.seq - seq);
> + *
> + *   if (info.text_len > r.text_buf_size) {
> + *   pr_warn("record %llu text truncated\n", info.seq);
> + *   text_buf[sizeof(text_buf) - 1] = 0;
> + *   }

AFAIK, the trailing '\0' is missing even the the text is not
truncated. We need to always 

Re: buffer allocation: was: [PATCH v3 3/3] printk: use the lockless ringbuffer

2020-06-29 Thread Petr Mladek
On Fri 2020-06-26 17:02:48, John Ogness wrote:
> On 2020-06-25, Petr Mladek  wrote:
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> >> +  free = __LOG_BUF_LEN;
> >> +  prb_for_each_record(0, _rb_static, seq, )
> >> +  free -= add_to_rb(_rb_dynamic, );
> >> +
> >> +  prb = _rb_dynamic;
> >
> > This might deserve a comment that this is safe (no lost message)
> > because it is called early enough when everything is still running
> > on the boot CPU.
> 
> I will add a comment and an extra check to make sure.
> 
> Once everything is lockless, new messages could appear (for example, due
> to NMI messages). The simple check should probably change to a loop. But
> let us not worry about that at this point.

Yup.

> Below is a new version of the relevant patch snippets against mainline
> just to show you how I plan to make it look for the next version.
> 
> John Ogness
> 
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ @@
> +#define PRB_AVGBITS 5/* 32 character average length */
> +
> +#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
> +#error CONFIG_LOG_BUF_SHIFT value too small.
> +#endif
> +_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
> +  PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]);
> +
> @@ @@
>  void __init setup_log_buf(int early)
>  {
> + unsigned int new_descs_count;
> + struct prb_desc *new_descs;
> + struct printk_info info;
> + struct printk_record r;
> + size_t new_descs_size;
>   unsigned long flags;
> + char *new_dict_buf;
>   char *new_log_buf;
>   unsigned int free;
> + u64 seq;
>  
>   /*
>* Some archs call setup_log_buf() multiple times - first is very
> @@ @@ void __init setup_log_buf(int early)
>   if (!new_log_buf_len)
>   return;
>  
> + new_descs_count = new_log_buf_len >> PRB_AVGBITS;
> + if (new_descs_count == 0) {
> + pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
> + return;
> + }
> +
>   new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
>   if (unlikely(!new_log_buf)) {
> - pr_err("log_buf_len: %lu bytes not available\n",
> - new_log_buf_len);
> + pr_err("log_buf_len: %lu text bytes not available\n",
> +new_log_buf_len);
> + return;
> + }
> +
> + new_dict_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
> + if (unlikely(!new_dict_buf)) {
> + pr_err("log_buf_len: %lu dict bytes not available\n",
> +new_log_buf_len);
> + memblock_free(__pa(new_log_buf), new_log_buf_len);
>   return;
>   }
>  
> + new_descs_size = new_descs_count * sizeof(struct prb_desc);
> + new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
> + if (unlikely(!new_descs)) {
> + pr_err("log_buf_len: %lu desc bytes not available\n",
> +new_descs_size);
> + memblock_free(__pa(new_dict_buf), new_log_buf_len);
> + memblock_free(__pa(new_log_buf), new_log_buf_len);
> + return;
> + }
> +
> + prb_rec_init_rd(, ,
> + _text_buf[0], sizeof(setup_text_buf),
> + _dict_buf[0], sizeof(setup_dict_buf));
> +
> + prb_init(_rb_dynamic,
> +  new_log_buf, order_base_2(new_log_buf_len),
> +  new_dict_buf, order_base_2(new_log_buf_len),
> +  new_descs, order_base_2(new_descs_count));

order_base_2() is safe. But the result might be tat some allocated
space is not used.

I would prefer to make sure that new_log_buf_len is rounded, e.g.
by roundup_pow_of_two(), at the beginning of the function. Then we
could use ilog2() here.

Otherwise, it looks fine to me.

Best Regards,
Petr

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


  1   2   >