[PATCH v4 4.4 3/4] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream. With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid of it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2ff03cc..16825dd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -575,15 +575,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. -* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.7.4
[PATCH v4 4.4 3/4] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream. With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2ff03cc..16825dd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -575,15 +575,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. 
-* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.7.4
[PATCH v4 4.4 1/4] KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
From: Paolo Bonzini commit 5a5fbdc0e3f1159a734f1890da60fce70e98271d upstream. It is now equal to use_eager_fpu(), which simply tests a cpufeature bit. Signed-off-by: Paolo Bonzini Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c| 3 +-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74fda1a..3a37cdb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -439,7 +439,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; - bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 83d6369..7b4ea5e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu(); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53d43d2..660d8a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7325,7 +7325,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } -- 2.7.4
[PATCH v4 4.4 1/4] KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
From: Paolo Bonzini commit 5a5fbdc0e3f1159a734f1890da60fce70e98271d upstream. It is now equal to use_eager_fpu(), which simply tests a cpufeature bit. Signed-off-by: Paolo Bonzini Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c| 3 +-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74fda1a..3a37cdb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -439,7 +439,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; - bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 83d6369..7b4ea5e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu(); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53d43d2..660d8a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7325,7 +7325,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } -- 2.7.4
[PATCH v3 4.4.y 2/3] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid of it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7796e04..c8ba9b8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -581,15 +581,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. -* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.1.4
[PATCH v3 4.4.y 2/3] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7796e04..c8ba9b8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -581,15 +581,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. 
-* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.1.4
[PATCH v3 4.4.y 1/3] x86/fpu: Remove use_eager_fpu()
From: Andy Lutomirski commit c592b57347069abfc0dcad3b3a302cf882602597 upstream This removes all the obvious code paths that depend on lazy FPU mode. It shouldn't change the generated code at all. Signed-off-by: Andy Lutomirski Signed-off-by: Rik van Riel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-5-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/crypto/crc32c-intel_glue.c | 17 - arch/x86/include/asm/fpu/internal.h | 34 + arch/x86/kernel/fpu/core.c | 38 + arch/x86/kernel/fpu/signal.c| 8 +++- arch/x86/kvm/cpuid.c| 4 +--- arch/x86/kvm/x86.c | 10 -- 6 files changed, 14 insertions(+), 97 deletions(-) diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 15f5c76..d610c11 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,21 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. 
*/ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -185,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -197,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -256,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 146d838..7796e04 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -56,11 +56,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return true; -} - static __always_inline 
__pure bool use_xsaveopt(void) { return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); @@ -504,24 +499,6 @@ static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) } -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) -{ - if (!use_eager_fpu()) - clts(); -} - -static inline void __fpregs_deactivate_hw(void) -{ - if (!use_eager_fpu()) - stts(); -} - -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -530,7 +507,6 @@ static inline void __fpregs_deactivate(struct fpu *fpu) this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -/* Mu
[PATCH v3 4.4.y 1/3] x86/fpu: Remove use_eager_fpu()
From: Andy Lutomirski commit c592b57347069abfc0dcad3b3a302cf882602597 upstream This removes all the obvious code paths that depend on lazy FPU mode. It shouldn't change the generated code at all. Signed-off-by: Andy Lutomirski Signed-off-by: Rik van Riel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-5-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/crypto/crc32c-intel_glue.c | 17 - arch/x86/include/asm/fpu/internal.h | 34 + arch/x86/kernel/fpu/core.c | 38 + arch/x86/kernel/fpu/signal.c| 8 +++- arch/x86/kvm/cpuid.c| 4 +--- arch/x86/kvm/x86.c | 10 -- 6 files changed, 14 insertions(+), 97 deletions(-) diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 15f5c76..d610c11 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,21 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. 
*/ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -185,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -197,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -256,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 146d838..7796e04 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -56,11 +56,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return true; -} - static __always_inline 
__pure bool use_xsaveopt(void) { return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); @@ -504,24 +499,6 @@ static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) } -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) -{ - if (!use_eager_fpu()) - clts(); -} - -static inline void __fpregs_deactivate_hw(void) -{ - if (!use_eager_fpu()) - stts(); -} - -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -530,7 +507,6 @@ static inline void __fpregs_deactivate(struct fpu *fpu) this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -/* Mu
RE: [PATCH 4.4 00/24] 4.4.138-stable review
> -Original Message- > From: stable-ow...@vger.kernel.org On Behalf > Of Ben Hutchings [..] > 3.18 and 4.4 are still missing this important fix to early parameter > parsing: > > commit 02afeaae9843733a39cd9b11053748b2d1dc5ae7 > Author: Dave Hansen > Date: Tue Dec 22 14:52:38 2015 -0800 > > x86/boot: Fix early command-line parsing when matching at end I have cherry-picked that commit into both 3.18.y and 4.4.y (it applies cleanly) and tested them on my machine. Both worked correctly. Test method: - Added printks to the functions that detect noxsave, noxsaves and noxsaveopt - Booted both kernels with and without the commit, and adding the kernel parameter "noxsave" - Checked that "noxsaves" and "noxsaveopt" do not appear on dmesg anymore after the commit. Thanks, Daniel
RE: [PATCH 4.4 00/24] 4.4.138-stable review
> -Original Message- > From: stable-ow...@vger.kernel.org On Behalf > Of Ben Hutchings [..] > 3.18 and 4.4 are still missing this important fix to early parameter > parsing: > > commit 02afeaae9843733a39cd9b11053748b2d1dc5ae7 > Author: Dave Hansen > Date: Tue Dec 22 14:52:38 2015 -0800 > > x86/boot: Fix early command-line parsing when matching at end I have cherry-picked that commit into both 3.18.y and 4.4.y (it applies cleanly) and tested them on my machine. Both worked correctly. Test method: - Added printks to the functions that detect noxsave, noxsaves and noxsaveopt - Booted both kernels with and without the commit, and adding the kernel parameter "noxsave" - Checked that "noxsaves" and "noxsaveopt" do not appear on dmesg anymore after the commit. Thanks, Daniel
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > Sent: Friday, June 15, 2018 4:06 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 02:23:08PM +0900, Daniel Sangorrin wrote: > > > -Original Message- > > > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > > > Behalf Of 'Greg Kroah-Hartman' > > > Sent: Friday, June 15, 2018 1:56 PM > > > To: Daniel Sangorrin > > > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy > > > Lutomirski' > > > ; 'Rik van Riel' ; 'Borislav Petkov' > > > ; 'Brian Gerst' ; 'Dave Hansen' > > > ; 'Denys Vlasenko' ; > > > 'Fenghua Yu' ; 'H. 
Peter Anvin' ; > 'Josh > > > Poimboeuf' ; 'Linus Torvalds' > > > ; 'Oleg Nesterov' ; 'Peter > > > Zijlstra' ; 'Quentin Casasnovas' > > > ; 'Thomas Gleixner' ; > > > pbonz...@redhat.com; 'Ingo Molnar' > > > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > > > > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > > > Hi Greg, > > > > > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 > > > > > */ > > > > > --- a/arch/x86/include/asm/fpu/internal.h > > > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > > > */ > > > > > static __always_inline __pure bool use_eager_fpu(void) > > > > > { > > > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > > > + return true; > > > > > } > > > > > > > > Since this function returns always true then we can remove the code > depending on > > > lazy FPU mode. > > > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > > > > > static void __init fpu__init_parse_early_param(void) > > > > > { > > > > > - if (cmdline_find_option_bool(boot_command_line, > "eagerfpu=off")) { > > > > > - eagerfpu = DISABLE; > > > > > - fpu__clear_eager_fpu_features(); > > > > > - } > > > > > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > should > > > remove it from the Documentation. > > > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > > > > > I will try backporting those patches unless anyone has any objections. > > > > > > What are the git commit ids of those patches in Linus's tree? No need > > > to point to patchwork links, I don't use that tool. > > > > OK, I got it. 
> > > > "x86/fpu: Remove use_eager_fpu()": > c592b57347069abfc0dcad3b3a302cf882602597 > > "x86/fpu: Finish excising 'eagerfpu'": > e63650840e8b053aa09ad934877e87e9941ed135 > > Minor nit. For kernel commits, the "normal" way we reference them looks > like this: > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > e63650840e8b ("x86/fpu: Finish excising 'eagerfpu'") > > Which makes it much easier to read and understand. That's what we use > for the "Fixes:" tag in commits and in other places (text in commit > messages.) > > To automatically generate that format, you can just do: > git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n" > > I recommend just setting up an alias for the above line, otherwise it's > a pain to have to remember how to do it all the time. Here's what I do: > $ alias gsr='git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n"' > $ gsr c592b57347069abfc0dcad3b3a302cf882602597 > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > Thanks a lot for the detailed explanation. > > Unfortunately, they don't apply cleanly to stable kernels. > > Should be very simple to backport if you want to. Also I need copies > for the 4.9.y tree as well if you do so. OK, I will do it after the OSS Japan next week if you don't mind. I also want to run some tests for the FPU and try different boot parameter combinations (e.g. no387, nofxsr etc) to check I didn't break anything. Thanks, Daniel > > thanks, > > greg k-h
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > Sent: Friday, June 15, 2018 4:06 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 02:23:08PM +0900, Daniel Sangorrin wrote: > > > -Original Message- > > > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > > > Behalf Of 'Greg Kroah-Hartman' > > > Sent: Friday, June 15, 2018 1:56 PM > > > To: Daniel Sangorrin > > > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy > > > Lutomirski' > > > ; 'Rik van Riel' ; 'Borislav Petkov' > > > ; 'Brian Gerst' ; 'Dave Hansen' > > > ; 'Denys Vlasenko' ; > > > 'Fenghua Yu' ; 'H. 
Peter Anvin' ; > 'Josh > > > Poimboeuf' ; 'Linus Torvalds' > > > ; 'Oleg Nesterov' ; 'Peter > > > Zijlstra' ; 'Quentin Casasnovas' > > > ; 'Thomas Gleixner' ; > > > pbonz...@redhat.com; 'Ingo Molnar' > > > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > > > > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > > > Hi Greg, > > > > > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 > > > > > */ > > > > > --- a/arch/x86/include/asm/fpu/internal.h > > > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > > > */ > > > > > static __always_inline __pure bool use_eager_fpu(void) > > > > > { > > > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > > > + return true; > > > > > } > > > > > > > > Since this function returns always true then we can remove the code > depending on > > > lazy FPU mode. > > > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > > > > > static void __init fpu__init_parse_early_param(void) > > > > > { > > > > > - if (cmdline_find_option_bool(boot_command_line, > "eagerfpu=off")) { > > > > > - eagerfpu = DISABLE; > > > > > - fpu__clear_eager_fpu_features(); > > > > > - } > > > > > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > should > > > remove it from the Documentation. > > > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > > > > > I will try backporting those patches unless anyone has any objections. > > > > > > What are the git commit ids of those patches in Linus's tree? No need > > > to point to patchwork links, I don't use that tool. > > > > OK, I got it. 
> > > > "x86/fpu: Remove use_eager_fpu()": > c592b57347069abfc0dcad3b3a302cf882602597 > > "x86/fpu: Finish excising 'eagerfpu'": > e63650840e8b053aa09ad934877e87e9941ed135 > > Minor nit. For kernel commits, the "normal" way we reference them looks > like this: > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > e63650840e8b ("x86/fpu: Finish excising 'eagerfpu'") > > Which makes it much easier to read and understand. That's what we use > for the "Fixes:" tag in commits and in other places (text in commit > messages.) > > To automatically generate that format, you can just do: > git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n" > > I recommend just setting up an alias for the above line, otherwise it's > a pain to have to remember how to do it all the time. Here's what I do: > $ alias gsr='git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n"' > $ gsr c592b57347069abfc0dcad3b3a302cf882602597 > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > Thanks a lot for the detailed explanation. > > Unfortunately, they don't apply cleanly to stable kernels. > > Should be very simple to backport if you want to. Also I need copies > for the 4.9.y tree as well if you do so. OK, I will do it after the OSS Japan next week if you don't mind. I also want to run some tests for the FPU and try different boot parameter combinations (e.g. no387, nofxsr etc) to check I didn't break anything. Thanks, Daniel > > thanks, > > greg k-h
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of 'Greg Kroah-Hartman' > Sent: Friday, June 15, 2018 1:56 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > Hi Greg, > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > > > --- a/arch/x86/include/asm/fpu/internal.h > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > */ > > > static __always_inline __pure bool use_eager_fpu(void) > > > { > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > + return true; > > > } > > > > Since this function returns always true then we can remove the code > > depending on > lazy FPU mode. > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > static void __init fpu__init_parse_early_param(void) > > > { > > > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > > > - eagerfpu = DISABLE; > > > - fpu__clear_eager_fpu_features(); > > > - } > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > > should > remove it from the Documentation. > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > I will try backporting those patches unless anyone has any objections. > > What are the git commit ids of those patches in Linus's tree? 
No need > to point to patchwork links, I don't use that tool. OK, I got it. "x86/fpu: Remove use_eager_fpu()": c592b57347069abfc0dcad3b3a302cf882602597 "x86/fpu: Finish excising 'eagerfpu'": e63650840e8b053aa09ad934877e87e9941ed135 Unfortunately, they don't apply cleanly to stable kernels. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of 'Greg Kroah-Hartman' > Sent: Friday, June 15, 2018 1:56 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > Hi Greg, > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > > > --- a/arch/x86/include/asm/fpu/internal.h > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > */ > > > static __always_inline __pure bool use_eager_fpu(void) > > > { > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > + return true; > > > } > > > > Since this function returns always true then we can remove the code > > depending on > lazy FPU mode. > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > static void __init fpu__init_parse_early_param(void) > > > { > > > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > > > - eagerfpu = DISABLE; > > > - fpu__clear_eager_fpu_features(); > > > - } > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > > should > remove it from the Documentation. > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > I will try backporting those patches unless anyone has any objections. > > What are the git commit ids of those patches in Linus's tree? 
No need > to point to patchwork links, I don't use that tool. OK, I got it. "x86/fpu: Remove use_eager_fpu()": c592b57347069abfc0dcad3b3a302cf882602597 "x86/fpu: Finish excising 'eagerfpu'": e63650840e8b053aa09ad934877e87e9941ed135 Unfortunately, they don't apply cleanly to stable kernels. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
Hi Greg, > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > --- a/arch/x86/include/asm/fpu/internal.h > +++ b/arch/x86/include/asm/fpu/internal.h > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > */ > static __always_inline __pure bool use_eager_fpu(void) > { > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > + return true; > } Since this function returns always true then we can remove the code depending on lazy FPU mode. Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" Ref: https://patchwork.kernel.org/patch/9365883/ > static void __init fpu__init_parse_early_param(void) > { > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > - eagerfpu = DISABLE; > - fpu__clear_eager_fpu_features(); > - } Since this patch removes the kernel boot parameter "eagerfpu", maybe we should remove it from the Documentation. This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" Ref: https://patchwork.kernel.org/patch/9380673/ I will try backporting those patches unless anyone has any objections. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
Hi Greg, > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > --- a/arch/x86/include/asm/fpu/internal.h > +++ b/arch/x86/include/asm/fpu/internal.h > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > */ > static __always_inline __pure bool use_eager_fpu(void) > { > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > + return true; > } Since this function returns always true then we can remove the code depending on lazy FPU mode. Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" Ref: https://patchwork.kernel.org/patch/9365883/ > static void __init fpu__init_parse_early_param(void) > { > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > - eagerfpu = DISABLE; > - fpu__clear_eager_fpu_features(); > - } Since this patch removes the kernel boot parameter "eagerfpu", maybe we should remove it from the Documentation. This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" Ref: https://patchwork.kernel.org/patch/9380673/ I will try backporting those patches unless anyone has any objections. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: Naresh Kamboju [mailto:naresh.kamb...@linaro.org] > Sent: Friday, June 1, 2018 12:55 AM > To: Daniel Sangorrin > Cc: Greg Kroah-Hartman ; open list > ; linux- stable ; > Davidlohr Bueso ; Joe Lawrence ; > Andrea Arcangeli ; Manfred Spraul > ; Andrew Morton ; > Linus Torvalds > Subject: Re: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page > protection" > > On 31 May 2018 at 13:06, Daniel Sangorrin > wrote: > >> -Original Message- > >> From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > > .. > >> Thanks for letting us know, but this was reported already. See the > >> emails on lkml with the subject: > >> Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > >> from Davidlohr Bueso > >> Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > >> > >> where he discusses that the LTP test is incorrect and that the kernel > >> change is correct and that LTP is going to be fixed because of this. > > My two cents, > If you are referring to cve-2017-5669.c > LTP test case is been fixed few hours ago by Rafael Tinoco, > > - shm_addr = shmat(shm_id, ((void *)1), SHM_RND); > + shm_addr = shmat(shm_id, ((void *)1), SHM_RND | SHM_REMAP); > > LTP patch pull request and it is been merged. > https://github.com/linux-test-project/ltp/pull/324 Thanks a lot Naresh. I confirmed that the latest LTP cve-2017-5669 now PASSes. Thanks, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: Naresh Kamboju [mailto:naresh.kamb...@linaro.org] > Sent: Friday, June 1, 2018 12:55 AM > To: Daniel Sangorrin > Cc: Greg Kroah-Hartman ; open list > ; linux- stable ; > Davidlohr Bueso ; Joe Lawrence ; > Andrea Arcangeli ; Manfred Spraul > ; Andrew Morton ; > Linus Torvalds > Subject: Re: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page > protection" > > On 31 May 2018 at 13:06, Daniel Sangorrin > wrote: > >> -Original Message- > >> From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > > .. > >> Thanks for letting us know, but this was reported already. See the > >> emails on lkml with the subject: > >> Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > >> from Davidlohr Bueso > >> Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > >> > >> where he discusses that the LTP test is incorrect and that the kernel > >> change is correct and that LTP is going to be fixed because of this. > > My two cents, > If you are referring to cve-2017-5669.c > LTP test case is been fixed few hours ago by Rafael Tinoco, > > - shm_addr = shmat(shm_id, ((void *)1), SHM_RND); > + shm_addr = shmat(shm_id, ((void *)1), SHM_RND | SHM_REMAP); > > LTP patch pull request and it is been merged. > https://github.com/linux-test-project/ltp/pull/324 Thanks a lot Naresh. I confirmed that the latest LTP cve-2017-5669 now PASSes. Thanks, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] .. > Thanks for letting us know, but this was reported already. See the > emails on lkml with the subject: > Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > from Davidlohr Bueso > Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > > where he discusses that the LTP test is incorrect and that the kernel > change is correct and that LTP is going to be fixed because of this. OK, thank you for the information! Regards, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] .. > Thanks for letting us know, but this was reported already. See the > emails on lkml with the subject: > Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > from Davidlohr Bueso > Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > > where he discusses that the LTP test is incorrect and that the kernel > change is correct and that LTP is going to be fixed because of this. OK, thank you for the information! Regards, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > 4.4-stable review patch. If anyone has any objections, please let me know. > > -- > > From: Davidlohr Bueso > > commit a73ab244f0dad8fffb3291b905f73e2d3eaa7c00 upstream. > > Patch series "ipc/shm: shmat() fixes around nil-page". Sorry for being a bit late (the pace is really fast here). I have found a regression from 4.4.133-rc1 to 4.4.134-rc1 using Fuego LTP wrapper. 4.4.134-rc1 tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:74: INFO: Mapped shared memory to (nil) cve-2017-5669.c:78: FAIL: We have mapped a VM address within the first 64Kb cve-2017-5669.c:84: INFO: Touching shared memory to see if anything strange happens 4.4.133-rc1: tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:67: PASS: shmat returned EINVAL The culprits should be one or both of the two last commits to ipc/shm (one of them a revert). - ipc/shm: fix shmat() nil address after round-down when remapping - Revert "ipc/shm: Fix shmat mmap nil-page protection" I need to investigate the concrete reason, but for now I just wanted to report it. Thanks, Daniel > > These patches fix two issues reported[1] a while back by Joe and Andrea > around how shmat(2) behaves with nil-page. > > The first reverts a commit that it was incorrectly thought that mapping > nil-page (address=0) was a no no with MAP_FIXED. This is not the case, > with the exception of SHM_REMAP; which is address in the second patch. > > I chose two patches because it is easier to backport and it explicitly > reverts bogus behaviour. Both patches ought to be in -stable and ltp > testcases need updated (the added testcase around the cve can be > modified to just test for SHM_RND|SHM_REMAP). 
> > [1] lkml.kernel.org/r/20180430172152.nfa564pvgpk3ut7p@linux-n805 > > This patch (of 2): > > Commit 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > worked on the idea that we should not be mapping as root addr=0 and > MAP_FIXED. However, it was reported that this scenario is in fact > valid, thus making the patch both bogus and breaks userspace as well. > > For example X11's libint10.so relies on shmat(1, SHM_RND) for lowmem > initialization[1]. > > [1] > https://cgit.freedesktop.org/xorg/xserver/tree/hw/xfree86/os-support/linux/int1 > 0/linux.c#n347 > Link: http://lkml.kernel.org/r/20180503203243.15045-2-d...@stgolabs.net > Fixes: 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > Signed-off-by: Davidlohr Bueso > Reported-by: Joe Lawrence > Reported-by: Andrea Arcangeli > Cc: Manfred Spraul > Cc: > Signed-off-by: Andrew Morton > Signed-off-by: Linus Torvalds > Signed-off-by: Greg Kroah-Hartman > > --- > ipc/shm.c |9 ++--- > 1 file changed, 2 insertions(+), 7 deletions(-) > > --- a/ipc/shm.c > +++ b/ipc/shm.c > @@ -1113,13 +1113,8 @@ long do_shmat(int shmid, char __user *sh > goto out; > else if ((addr = (ulong)shmaddr)) { > if (addr & (shmlba - 1)) { > - /* > - * Round down to the nearest multiple of shmlba. > - * For sane do_mmap_pgoff() parameters, avoid > - * round downs that trigger nil-page and MAP_FIXED. > - */ > - if ((shmflg & SHM_RND) && addr >= shmlba) > - addr &= ~(shmlba - 1); > + if (shmflg & SHM_RND) > + addr &= ~(shmlba - 1); /* round down */ > else > #ifndef __ARCH_FORCE_SHMLBA > if (addr & ~PAGE_MASK) > >
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > 4.4-stable review patch. If anyone has any objections, please let me know. > > -- > > From: Davidlohr Bueso > > commit a73ab244f0dad8fffb3291b905f73e2d3eaa7c00 upstream. > > Patch series "ipc/shm: shmat() fixes around nil-page". Sorry for being a bit late (the pace is really fast here). I have found a regression from 4.4.133-rc1 to 4.4.134-rc1 using Fuego LTP wrapper. 4.4.134-rc1 tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:74: INFO: Mapped shared memory to (nil) cve-2017-5669.c:78: FAIL: We have mapped a VM address within the first 64Kb cve-2017-5669.c:84: INFO: Touching shared memory to see if anything strange happens 4.4.133-rc1: tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:67: PASS: shmat returned EINVAL The culprits should be one or both of the two last commits to ipc/shm (one of them a revert). - ipc/shm: fix shmat() nil address after round-down when remapping - Revert "ipc/shm: Fix shmat mmap nil-page protection" I need to investigate the concrete reason, but for now I just wanted to report it. Thanks, Daniel > > These patches fix two issues reported[1] a while back by Joe and Andrea > around how shmat(2) behaves with nil-page. > > The first reverts a commit that it was incorrectly thought that mapping > nil-page (address=0) was a no no with MAP_FIXED. This is not the case, > with the exception of SHM_REMAP; which is address in the second patch. > > I chose two patches because it is easier to backport and it explicitly > reverts bogus behaviour. Both patches ought to be in -stable and ltp > testcases need updated (the added testcase around the cve can be > modified to just test for SHM_RND|SHM_REMAP). 
> > [1] lkml.kernel.org/r/20180430172152.nfa564pvgpk3ut7p@linux-n805 > > This patch (of 2): > > Commit 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > worked on the idea that we should not be mapping as root addr=0 and > MAP_FIXED. However, it was reported that this scenario is in fact > valid, thus making the patch both bogus and breaks userspace as well. > > For example X11's libint10.so relies on shmat(1, SHM_RND) for lowmem > initialization[1]. > > [1] > https://cgit.freedesktop.org/xorg/xserver/tree/hw/xfree86/os-support/linux/int1 > 0/linux.c#n347 > Link: http://lkml.kernel.org/r/20180503203243.15045-2-d...@stgolabs.net > Fixes: 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > Signed-off-by: Davidlohr Bueso > Reported-by: Joe Lawrence > Reported-by: Andrea Arcangeli > Cc: Manfred Spraul > Cc: > Signed-off-by: Andrew Morton > Signed-off-by: Linus Torvalds > Signed-off-by: Greg Kroah-Hartman > > --- > ipc/shm.c |9 ++--- > 1 file changed, 2 insertions(+), 7 deletions(-) > > --- a/ipc/shm.c > +++ b/ipc/shm.c > @@ -1113,13 +1113,8 @@ long do_shmat(int shmid, char __user *sh > goto out; > else if ((addr = (ulong)shmaddr)) { > if (addr & (shmlba - 1)) { > - /* > - * Round down to the nearest multiple of shmlba. > - * For sane do_mmap_pgoff() parameters, avoid > - * round downs that trigger nil-page and MAP_FIXED. > - */ > - if ((shmflg & SHM_RND) && addr >= shmlba) > - addr &= ~(shmlba - 1); > + if (shmflg & SHM_RND) > + addr &= ~(shmlba - 1); /* round down */ > else > #ifndef __ARCH_FORCE_SHMLBA > if (addr & ~PAGE_MASK) > >
RE: [PATCH 4.4 00/92] 4.4.133-stable review
> -Original Message- > From: Rafael Tinoco [mailto:rafael.tin...@linaro.org] > > Thank you Daniel! Will investigate those. OK, thank you :). Notice that I did those tests on x86_64. It seems you are testing on arm, so there may be some differences. I just checked these tests on 4.4.133 (on x86_64): fcntl35: PASS fcntl35_64: PASS select04: PASS I am currently investigating other tests that are failing as well. They are not regressions, just some patches have not been backported yet. Thanks, Daniel > > Meanwhile, Greg, I referred to: > > time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Since we're not using this type of clock on arm64's 4.4 kernel vdso > functions. This commit's description calls attention for it to be > responsible for fixing kselftest flacking tests, we wouldn't get that > on 4.4 according to bellow: > > stable-rc-linux-4.14.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.16.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.4.y > > > stable-rc-linux-4.9.y > 99f66b5182a4 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > Yet, the second fix was backported to all (including 4.4.y): > > stable-rc-linux-4.14.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.16.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW 
sub-nanosecond accounting > stable-rc-linux-4.4.y > 7c8bd6e07430 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.9.y > a53bfdda06ac time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Not sure you want to keep it in 4.4, thought it was worth mentioning it. > > Cheers. > > On 24 May 2018 at 22:34, Daniel Sangorrin > <daniel.sangor...@toshiba.co.jp> wrote: > > Hello Rafael, > > > > The tests fcntl35 and fcntl35_64 should have go from FAIL to PASS. > > https://www.spinics.net/lists/stable/msg239475.html > > > > Looking at > > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ > > I see that these two tests (and other important tests as well) are being > > SKIPPED. > > > > By the way, I see that select04 FAILS in your case. But in my setup, > > select04 was > working fine (x86_64) in 4.4.132. I will confirm that it still works in > 4.4.133 > > > > Thanks, > > Daniel Sangorrin > > > >> -Original Message- > >> From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > >> Behalf Of Rafael Tinoco > >> Sent: Friday, May 25, 2018 5:32 AM > >> To: Greg Kroah-Hartman <gre...@linuxfoundation.org> > >> Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > >> lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > >> sta...@vger.kernel.org; a...@linux-foundation.org; > >> torva...@linux-foundation.org; li...@roeck-us.net > >> Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > >> > >> > > kernel: 4.4.133-rc1 > >> > > git repo: > >> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > >> > > git branch: linux-4.4.y > >> > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > >> > > git describe: v4.4.132-93-g915a3d7cdea9 > >> > > Test details: > >> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > >> 3d7cdea9 > >> > > > >> > > > >> > > No 
regressions (compared to build v4.4.132-71-g180635995c36) > >> > > >> > It should have gotten better, as there was a fix in here for at least 2 > >> > LTP tests that we previously were not passing. I don't know why you all > >> > were not reporting that in the past, it took someone else randomly > >> > deciding to run LTP to report it to me... > >> > > >> > Why did an improvement in results not show up? > >> > >> Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? > >> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > >> (commit 49eea433b326 in mainline) so this "fix" is changing the > >> timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > >> ARM64. Needs review though :\ > > > > > >
RE: [PATCH 4.4 00/92] 4.4.133-stable review
> -Original Message- > From: Rafael Tinoco [mailto:rafael.tin...@linaro.org] > > Thank you Daniel! Will investigate those. OK, thank you :). Notice that I did those tests on x86_64. It seems you are testing on arm, so there may be some differences. I just checked these tests on 4.4.133 (on x86_64): fcntl35: PASS fcntl35_64: PASS select04: PASS I am currently investigating other tests that are failing as well. They are not regressions, just some patches have not been backported yet. Thanks, Daniel > > Meanwhile, Greg, I referred to: > > time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Since we're not using this type of clock on arm64's 4.4 kernel vdso > functions. This commit's description calls attention for it to be > responsible for fixing kselftest flacking tests, we wouldn't get that > on 4.4 according to bellow: > > stable-rc-linux-4.14.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.16.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.4.y > > > stable-rc-linux-4.9.y > 99f66b5182a4 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > Yet, the second fix was backported to all (including 4.4.y): > > stable-rc-linux-4.14.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.16.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW 
sub-nanosecond accounting > stable-rc-linux-4.4.y > 7c8bd6e07430 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.9.y > a53bfdda06ac time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Not sure you want to keep it in 4.4, thought it was worth mentioning it. > > Cheers. > > On 24 May 2018 at 22:34, Daniel Sangorrin > wrote: > > Hello Rafael, > > > > The tests fcntl35 and fcntl35_64 should have go from FAIL to PASS. > > https://www.spinics.net/lists/stable/msg239475.html > > > > Looking at > > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ > > I see that these two tests (and other important tests as well) are being > > SKIPPED. > > > > By the way, I see that select04 FAILS in your case. But in my setup, > > select04 was > working fine (x86_64) in 4.4.132. I will confirm that it still works in > 4.4.133 > > > > Thanks, > > Daniel Sangorrin > > > >> -Original Message- > >> From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > >> Behalf Of Rafael Tinoco > >> Sent: Friday, May 25, 2018 5:32 AM > >> To: Greg Kroah-Hartman > >> Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > >> lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > >> sta...@vger.kernel.org; a...@linux-foundation.org; > >> torva...@linux-foundation.org; li...@roeck-us.net > >> Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > >> > >> > > kernel: 4.4.133-rc1 > >> > > git repo: > >> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > >> > > git branch: linux-4.4.y > >> > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > >> > > git describe: v4.4.132-93-g915a3d7cdea9 > >> > > Test details: > >> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > >> 3d7cdea9 > >> > > > >> > > > >> > > No regressions (compared to build v4.4.132-71-g180635995c36) > >> > 
> >> > It should have gotten better, as there was a fix in here for at least 2 > >> > LTP tests that we previously were not passing. I don't know why you all > >> > were not reporting that in the past, it took someone else randomly > >> > deciding to run LTP to report it to me... > >> > > >> > Why did an improvement in results not show up? > >> > >> Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? > >> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > >> (commit 49eea433b326 in mainline) so this "fix" is changing the > >> timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > >> ARM64. Needs review though :\ > > > > > >
RE: [PATCH 4.4 00/92] 4.4.133-stable review
Hello Rafael, The tests fcntl35 and fcntl35_64 should have gone from FAIL to PASS. https://www.spinics.net/lists/stable/msg239475.html Looking at https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ I see that these two tests (and other important tests as well) are being SKIPPED. By the way, I see that select04 FAILS in your case. But in my setup, select04 was working fine (x86_64) in 4.4.132. I will confirm that it still works in 4.4.133. Thanks, Daniel Sangorrin > -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of Rafael Tinoco > Sent: Friday, May 25, 2018 5:32 AM > To: Greg Kroah-Hartman <gre...@linuxfoundation.org> > Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > sta...@vger.kernel.org; a...@linux-foundation.org; > torva...@linux-foundation.org; li...@roeck-us.net > Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > > > > kernel: 4.4.133-rc1 > > > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > > git branch: linux-4.4.y > > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > > > git describe: v4.4.132-93-g915a3d7cdea9 > > > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9 > > > > > > > > > No regressions (compared to build v4.4.132-71-g180635995c36) > > > > It should have gotten better, as there was a fix in here for at least 2 > > LTP tests that we previously were not passing. I don't know why you all > > were not reporting that in the past, it took someone else randomly > > deciding to run LTP to report it to me... > > > > Why did an improvement in results not show up? > > Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? 
> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > (commit 49eea433b326 in mainline) so this "fix" is changing the > timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > ARM64. Needs review though :\
RE: [PATCH 4.4 00/92] 4.4.133-stable review
Hello Rafael, The tests fcntl35 and fcntl35_64 should have gone from FAIL to PASS. https://www.spinics.net/lists/stable/msg239475.html Looking at https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ I see that these two tests (and other important tests as well) are being SKIPPED. By the way, I see that select04 FAILS in your case. But in my setup, select04 was working fine (x86_64) in 4.4.132. I will confirm that it still works in 4.4.133. Thanks, Daniel Sangorrin > -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of Rafael Tinoco > Sent: Friday, May 25, 2018 5:32 AM > To: Greg Kroah-Hartman > Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > sta...@vger.kernel.org; a...@linux-foundation.org; > torva...@linux-foundation.org; li...@roeck-us.net > Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > > > > kernel: 4.4.133-rc1 > > > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > > git branch: linux-4.4.y > > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > > > git describe: v4.4.132-93-g915a3d7cdea9 > > > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9 > > > > > > > > > No regressions (compared to build v4.4.132-71-g180635995c36) > > > > It should have gotten better, as there was a fix in here for at least 2 > > LTP tests that we previously were not passing. I don't know why you all > > were not reporting that in the past, it took someone else randomly > > deciding to run LTP to report it to me... > > > > Why did an improvement in results not show up? > > Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? 
> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > (commit 49eea433b326 in mainline) so this "fix" is changing the > timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > ARM64. Needs review though :\
RE: [kernel-hardening] [RFC PATCH 1/1] seccomp: provide information about the previous syscall
Hi, Jann, Andy, Alexei, Kees and Paul: thanks a lot for your comments on my RFC!!. There were a few important points that I didn't mention but are critical to understand what I was trying to do. The focus of the patch was on protecting "real-time embedded IoT devices" such as a PLC (programmable logic controller) inside a factory assembly line . They have a few important properties that I took into consideration: - They often rely on firewall technology, and are not updated for many years (~20 years). For that reason, I think that a white-list approach (define the correct behaviour) seems suitable. Note also that the typical problem of white list approaches, false-positives, is unlikely to occur because they are very deterministic systems. - No asynchronous signal handlers: real-time applications need deterministic response times. For that reason, signals are handled synchronously typically by using 'sigtimedwait' on a separate thread. - Initialization vs cycle: real-time applications usually have an initialization phase where memory and stack are locked into RAM and threads are created. After the initialization phase, threads typically loop through periodic cycles and perform their tasks. The important point here is that once the initialization is done we can ban any further calls to 'clone', 'execve', 'mprotect' and the like. This can be done already by installing an extra filter. For the cyclic phase, my patch would allow enforcing the order of the system calls inside the cycles. (e.g.: read sensor, send a message, and write to an actuator). Despite the fact that the attacker cannot call 'clone' anymore, he could try to alter the control of an external actuator (e.g. a motor) by using the 'ioctl' system call for example. - Mimicry: as I mentioned in the cover letter (and Jann showed with his ROP attack) if the attacker is able to emulate the system call's order (plus its arguments and the address from which the call was made) this patch can be bypassed. 
However, note that this is not easy for several reasons: + the attacker may need a long stack to mimic all the system calls and their arguments. + the stealthy attacker must make sure the real-time application does not crash, miss any of its deadlines or cause deadline misses in other apps [Note] Real-time application binaries are usually closed source so this might require quite a bit of effort. + randomized system calls: applications could randomly activate dummy system calls each time they are instantiated (and adjust their BPF filter, which should later be zeroed). In this case, the attacker (or virus) would need to figure out which dummy system calls have to be mimicked and prepare a stack accordingly. This seems challenging. [Note] under a brute force attack, the application may just raise an alarm, activate a redundant node (not connected to the network) and commit digital suicide :). About the ABI, by all means I don't want to break it. If putting the field at the end does not break it, as Alexei mentioned, I can change it. Also I would be glad to review the SECCOMP_FILTER_FLAG_TSYNC flag mentioned by Jann in case there is any interest. However, I'll understand the NACK if you think that the maintenance is not worth it as Andy mentioned; that it can be bypassed under certain conditions; or the fact that it focuses on a particular type of systems. I will keep reading the messages in the kernel-hardening list and see if I find another topic to contribute :). Thanks a lot for your consideration and comments, Daniel
RE: [kernel-hardening] [RFC PATCH 1/1] seccomp: provide information about the previous syscall
Hi, Jann, Andy, Alexei, Kees and Paul: thanks a lot for your comments on my RFC!!. There were a few important points that I didn't mention but are critical to understand what I was trying to do. The focus of the patch was on protecting "real-time embedded IoT devices" such as a PLC (programmable logic controller) inside a factory assembly line . They have a few important properties that I took into consideration: - They often rely on firewall technology, and are not updated for many years (~20 years). For that reason, I think that a white-list approach (define the correct behaviour) seems suitable. Note also that the typical problem of white list approaches, false-positives, is unlikely to occur because they are very deterministic systems. - No asynchronous signal handlers: real-time applications need deterministic response times. For that reason, signals are handled synchronously typically by using 'sigtimedwait' on a separate thread. - Initialization vs cycle: real-time applications usually have an initialization phase where memory and stack are locked into RAM and threads are created. After the initialization phase, threads typically loop through periodic cycles and perform their tasks. The important point here is that once the initialization is done we can ban any further calls to 'clone', 'execve', 'mprotect' and the like. This can be done already by installing an extra filter. For the cyclic phase, my patch would allow enforcing the order of the system calls inside the cycles. (e.g.: read sensor, send a message, and write to an actuator). Despite the fact that the attacker cannot call 'clone' anymore, he could try to alter the control of an external actuator (e.g. a motor) by using the 'ioctl' system call for example. - Mimicry: as I mentioned in the cover letter (and Jann showed with his ROP attack) if the attacker is able to emulate the system call's order (plus its arguments and the address from which the call was made) this patch can be bypassed. 
However, note that this is not easy for several reasons: + the attacker may need a long stack to mimic all the system calls and their arguments. + the stealthy attacker must make sure the real-time application does not crash, miss any of its deadlines or cause deadline misses in other apps [Note] Real-time application binaries are usually closed source so this might require quite a bit of effort. + randomized system calls: applications could randomly activate dummy system calls each time they are instantiated (and adjust their BPF filter, which should later be zeroed). In this case, the attacker (or virus) would need to figure out which dummy system calls have to be mimicked and prepare a stack accordingly. This seems challenging. [Note] under a brute force attack, the application may just raise an alarm, activate a redundant node (not connected to the network) and commit digital suicide :). About the ABI, by all means I don't want to break it. If putting the field at the end does not break it, as Alexei mentioned, I can change it. Also I would be glad to review the SECCOMP_FILTER_FLAG_TSYNC flag mentioned by Jann in case there is any interest. However, I'll understand the NACK if you think that the maintenance is not worth it as Andy mentioned; that it can be bypassed under certain conditions; or the fact that it focuses on a particular type of systems. I will keep reading the messages in the kernel-hardening list and see if I find another topic to contribute :). Thanks a lot for your consideration and comments, Daniel
[RFC PATCH 0/1] Adding previous syscall context to seccomp
Hi, During my presentation last year at Linuxcon Japan [1], I released a proof-of-concept patch [2] for the seccomp subsystem. The main purpose of that patch was to let applications restrict the order in which their system calls are requested. In more technical terms, a host-based anomaly intrusion detection system (HIDS) that uses call sequence monitoring for detecting unusual patterns. For example, to detect when the execution flow unexpectedly diverts towards the 'mprotect' syscall, perhaps after a stack overflow. The main target for the patch was embedded real-time systems where applications have a high degree of determinism. For that reason, my original proof-of-concept patch was using bitmaps, which allow for a constant O(1) overhead (correct me if I'm wrong but I think the current seccomp-filter implementation introduces an O(n) overhead proportional to the number of system calls that one wants to allow or prohibit). However, I realized that it would be too hard to merge with the current code. I have adapted my original patch which now allows BPF filters to retrieve information regarding the previous system call requested by the application. The patch can be tested on linux-master as follows (tested on Debian Jessie x86_64): $ sudo vi /usr/include/linux/seccomp.h ... struct seccomp_data { int nr; int prev_nr; <-- add this entry ... $ cd samples/seccomp/ $ make bpf-prev $ ./bpf-prev parent msgsnd: hello parent msgrcv after prctl: hello (128 bytes) parent msgsnd: world parent msgrcv after msgsnd: world (128 bytes) parent msgsnd: this is mars child msgrcv after clone: this is mars (128 bytes) parent: child 11409 exited with status 0 Should fail: Bad system call For simplicity, at the moment the patch only records the last requested system call. Despite being vulnerable to specially- crafted mimicry attacks, I think it can deter common attacks specially during the "initial phase" of the attack (e.g.: a return-oriented jump). 
It could also be extended with longer call sequences (NGRAMs), call stack and call site information, or scratch memory for restricting a system call to the application's initalization for example. However, I'm not sure if such complexity would be worth. I would like to know at this early stage if any of you is interested in this type of approach and what you think about it. Thanks, Daniel [1] Kernel security hacking for the Internet of Things http://events.linuxfoundation.jp/sites/events/files/slides/linuxcon-2015-daniel-sangorrin-final.pdf [2] https://github.com/sangorrin/linuxcon-japan-2015/tree/master/hids Daniel Sangorrin (1): seccomp: provide information about the previous syscall include/linux/seccomp.h | 2 + include/uapi/linux/seccomp.h | 2 + kernel/seccomp.c | 10 +++ samples/seccomp/.gitignore | 1 + samples/seccomp/Makefile | 9 ++- samples/seccomp/bpf-prev.c | 160 +++ 6 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 samples/seccomp/bpf-prev.c -- 2.1.4
[RFC PATCH 1/1] seccomp: provide information about the previous syscall
This patch allows applications to restrict the order in which their system calls may be requested.
@@ -46,6 +47,7 @@ */ struct seccomp_data { int nr; + int prev_nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d..98b2c9d3 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,6 +190,8 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) sd = _local; } + sd->prev_nr = current->seccomp.prev_nr; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -200,6 +202,9 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } + + current->seccomp.prev_nr = sd->nr; + return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -443,6 +448,11 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Initialize the prev_nr field only once */ + if (current->seccomp.filter == NULL) + current->seccomp.prev_nr = + syscall_get_nr(current, task_pt_regs(current)); + /* * If there is an existing filter, make it the prev and don't drop its * task reference. diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index 78fb781..11dda7a 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,3 +1,4 @@ bpf-direct bpf-fancy dropper +bpf-prev diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 1b4e4b8..b50821c 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,7 +1,7 @@ # kbuild trick to avoid linker error. Can be omitted if a module is built. 
obj- := dummy.o -hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct +hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct bpf-prev HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include @@ -17,6 +17,11 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o +HOSTCFLAGS_bpf-prev.o += -I$(objtree)/usr/include +HOSTCFLAGS_bpf-prev.o += -idirafter $(objtree)/include +bpf-prev-objs := bpf-prev.o + + # Try to match the kernel target. ifndef CROSS_COMPILE ifndef CONFIG_64BIT @@ -29,10 +34,12 @@ MFLAG = -m31 endif HOSTCFLAGS_bpf-direct.o += $(MFLAG) +HOSTCFLAGS_bpf-prev.o += $(MFLAG) HOSTCFLAGS_dropper.o += $(MFLAG) HOSTCFLAGS_bpf-helper.o += $(MFLAG) HOSTCFLAGS_bpf-fancy.o += $(MFLAG) HOSTLOADLIBES_bpf-direct += $(MFLAG) +HOSTLOADLIBES_bpf-prev += $(MFLAG) HOSTLOADLIBES_bpf-fancy += $(MFLAG) HOSTLOADLIBES_dropper += $(MFLAG) endif diff --git a/samples/seccomp/bpf-prev.c b/samples/seccomp/bpf-prev.c new file mode 100644 index 000..138c584 --- /dev/null +++ b/samples/seccomp/bpf-prev.c @@ -0,0 +1,160 @@ +/* + *
[RFC PATCH 1/1] seccomp: provide information about the previous syscall
This patch allows applications to restrict the order in which their system calls may be requested.
@@ -46,6 +47,7 @@ */ struct seccomp_data { int nr; + int prev_nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d..98b2c9d3 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,6 +190,8 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) sd = _local; } + sd->prev_nr = current->seccomp.prev_nr; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -200,6 +202,9 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } + + current->seccomp.prev_nr = sd->nr; + return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -443,6 +448,11 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Initialize the prev_nr field only once */ + if (current->seccomp.filter == NULL) + current->seccomp.prev_nr = + syscall_get_nr(current, task_pt_regs(current)); + /* * If there is an existing filter, make it the prev and don't drop its * task reference. diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index 78fb781..11dda7a 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,3 +1,4 @@ bpf-direct bpf-fancy dropper +bpf-prev diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 1b4e4b8..b50821c 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,7 +1,7 @@ # kbuild trick to avoid linker error. Can be omitted if a module is built. 
obj- := dummy.o -hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct +hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct bpf-prev HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include @@ -17,6 +17,11 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o +HOSTCFLAGS_bpf-prev.o += -I$(objtree)/usr/include +HOSTCFLAGS_bpf-prev.o += -idirafter $(objtree)/include +bpf-prev-objs := bpf-prev.o + + # Try to match the kernel target. ifndef CROSS_COMPILE ifndef CONFIG_64BIT @@ -29,10 +34,12 @@ MFLAG = -m31 endif HOSTCFLAGS_bpf-direct.o += $(MFLAG) +HOSTCFLAGS_bpf-prev.o += $(MFLAG) HOSTCFLAGS_dropper.o += $(MFLAG) HOSTCFLAGS_bpf-helper.o += $(MFLAG) HOSTCFLAGS_bpf-fancy.o += $(MFLAG) HOSTLOADLIBES_bpf-direct += $(MFLAG) +HOSTLOADLIBES_bpf-prev += $(MFLAG) HOSTLOADLIBES_bpf-fancy += $(MFLAG) HOSTLOADLIBES_dropper += $(MFLAG) endif diff --git a/samples/seccomp/bpf-prev.c b/samples/seccomp/bpf-prev.c new file mode 100644 index 000..138c584 --- /dev/null +++ b/sample
[RFC PATCH 0/1] Adding previous syscall context to seccomp
Hi, During my presentation last year at Linuxcon Japan [1], I released a proof-of-concept patch [2] for the seccomp subsystem. The main purpose of that patch was to let applications restrict the order in which their system calls are requested. In more technical terms, a host-based anomaly intrusion detection system (HIDS) that uses call sequence monitoring for detecting unusual patterns. For example, to detect when the execution flow unexpectedly diverts towards the 'mprotect' syscall, perhaps after a stack overflow. The main target for the patch was embedded real-time systems where applications have a high degree of determinism. For that reason, my original proof-of-concept patch was using bitmaps, which allow for a constant O(1) overhead (correct me if I'm wrong but I think the current seccomp-filter implementation introduces an O(n) overhead proportional to the number of system calls that one wants to allow or prohibit). However, I realized that it would be too hard to merge with the current code. I have adapted my original patch which now allows BPF filters to retrieve information regarding the previous system call requested by the application. The patch can be tested on linux-master as follows (tested on Debian Jessie x86_64): $ sudo vi /usr/include/linux/seccomp.h ... struct seccomp_data { int nr; int prev_nr; <-- add this entry ... $ cd samples/seccomp/ $ make bpf-prev $ ./bpf-prev parent msgsnd: hello parent msgrcv after prctl: hello (128 bytes) parent msgsnd: world parent msgrcv after msgsnd: world (128 bytes) parent msgsnd: this is mars child msgrcv after clone: this is mars (128 bytes) parent: child 11409 exited with status 0 Should fail: Bad system call For simplicity, at the moment the patch only records the last requested system call. Despite being vulnerable to specially- crafted mimicry attacks, I think it can deter common attacks specially during the "initial phase" of the attack (e.g.: a return-oriented jump). 
It could also be extended with longer call sequences (NGRAMs), call stack and call site information, or scratch memory for restricting a system call to the application's initalization for example. However, I'm not sure if such complexity would be worth. I would like to know at this early stage if any of you is interested in this type of approach and what you think about it. Thanks, Daniel [1] Kernel security hacking for the Internet of Things http://events.linuxfoundation.jp/sites/events/files/slides/linuxcon-2015-daniel-sangorrin-final.pdf [2] https://github.com/sangorrin/linuxcon-japan-2015/tree/master/hids Daniel Sangorrin (1): seccomp: provide information about the previous syscall include/linux/seccomp.h | 2 + include/uapi/linux/seccomp.h | 2 + kernel/seccomp.c | 10 +++ samples/seccomp/.gitignore | 1 + samples/seccomp/Makefile | 9 ++- samples/seccomp/bpf-prev.c | 160 +++ 6 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 samples/seccomp/bpf-prev.c -- 2.1.4
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/25 13:51, Viresh Kumar wrote: > On 25 April 2014 06:01, Daniel Sangorrin > wrote: >> >>>> I can't keep it as a separate patch and so would be required to merge >>>> it into my original patch.. >>> >>> And the reason being: "No patch is supposed to break things, otherwise >>> git bisect wouldn't work smoothly".. And so git bisect would complain >>> this issue after my patch and so I have to merge the fixes you gave into >>> the original patch as its not yet merged. >>> -- >>> To unsubscribe from this list: send the line "unsubscribe cgroups" in >>> the body of a message to majord...@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> Ok, no problem then. > > Do you want me to add your Tested-by for my next version ? > > Sure. Tested-by: Daniel Sangorrin -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi --- Documentation/timers/timer_stats.txt | 43 kernel/time/timer_stats.c| 20 + 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..aeb467f 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,24 +45,29 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which -initialized the timer and in parenthesis the callback function which was +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which +initialized the timer and in parentheses the callback function which was executed on expiry. 
Thomas, Ingo diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr->count = 0; + memset(curr->count, 0, sizeof(curr->count)); curr->next = NULL; memcpy(curr->comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(, comm); if (likely(entry)) - entry->count++; + entry->count[raw_smp_processor_id()]++; else atomic_inc(_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0; ktime_t time; - int
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
>> I can't keep it as a separate patch and so would be required to merge >> it into my original patch.. > > And the reason being: "No patch is supposed to break things, otherwise > git bisect wouldn't work smoothly".. And so git bisect would complain > this issue after my patch and so I have to merge the fixes you gave into > the original patch as its not yet merged. > -- > To unsubscribe from this list: send the line "unsubscribe cgroups" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html Ok, no problem then. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] timer_stats: add core information to event counters
On 2014/04/25 2:54, Randy Dunlap wrote: > On 04/24/14 00:47, Daniel Sangorrin wrote: >> Add information specifying the CPU core where timer callbacks (events) >> were executed to the output of /proc/timer_stats. >> >> Signed-off-by: Daniel Sangorrin >> Signed-off-by: Yoshitake Kobayashi >> --- >> Documentation/timers/timer_stats.txt | 41 >> >> kernel/time/timer_stats.c| 20 ++ >> 2 files changed, 34 insertions(+), 27 deletions(-) >> >> diff --git a/Documentation/timers/timer_stats.txt >> b/Documentation/timers/timer_stats.txt >> index 8abd40b..e54077a 100644 >> --- a/Documentation/timers/timer_stats.txt >> +++ b/Documentation/timers/timer_stats.txt >> @@ -21,7 +21,7 @@ Linux system over a sample period: >> - the name of the process which initialized the timer >> - the function where the timer was initialized >> - the callback function which is associated to the timer >> -- the number of events (callbacks) >> +- the number of events (callbacks) executed per core >> >> timer_stats adds an entry to /proc: /proc/timer_stats >> >> @@ -45,23 +45,28 @@ readouts. 
>> >> Sample output of /proc/timer_stats: >> >> -Timerstats sample period: 3.888770 s >> - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) >> - 15, 1 swapper hcd_submit_urb (rh_timer_func) >> - 4, 959 kedacschedule_timeout (process_timeout) >> - 1, 0 swapper page_writeback_init (wb_timer_fn) >> - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) >> - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) >> - 3, 3100 bash schedule_timeout (process_timeout) >> - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) >> - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) >> - 1, 1 swapper neigh_table_init_no_netlink >> (neigh_periodic_timer) >> - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) >> - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) >> -90 total events, 30.0 events/sec >> - >> -The first column is the number of events, the second column the pid, the >> third >> -column is the name of the process. The forth column shows the function which >> +Timer Stats Version: v0.3 >> +Sample period: 4.365 s >> +Collection: inactive >> + 0, 4366, 0 swapper/1tick_setup_sched_timer >> (tick_sched_timer) >> + 4366, 0, 1 swapper/0tick_setup_sched_timer >> (tick_sched_timer) >> + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) >> + 0, 4, 515 apache2 schedule_hrtimeout_range_clock >> (hrtimer_wakeup) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) >> + 3, 4, 0 swapper/0clocksource_watchdog >> (clocksource_watchdog) >> +0D, 3D,13 kworker/1:0 queue_delayed_work_on >> (delayed_work_timer_fn) >> +3D, 0D,22 kworker/0:1 queue_delayed_work_on >> (delayed_work_timer_fn) >> + 1, 0,22 kworker/0:1 e1000_watchdog_task 
(e1000_watchdog) >> +8842 total events, 2025.658 events/sec >> + > > Is the number of events in decimal or hex? If in decimal, what do > 0D and 3D mean? It is in decimal notation. The 'D' means 'deferrable timer' and it is already described in timer_stats.txt. >> +The first comma-separated columns represent the number of events (one >> +column per present core), the next column the pid, and the next one is the >> +name of the process. The last column shows the function which >> initialized the timer and in parenthesis the callback function which was > > parentheses Well spotted. I'll fix it and send it again. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 17:41, Viresh Kumar wrote: > On 24 April 2014 14:01, Daniel Sangorrin > wrote: >> In kernel/cpuset.c:quiesce_cpuset() you are using the function >> 'smp_call_function_any' which asks CPU cores in 'cpumask' to >> execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. >> >> In the case above, 'cpumask' corresponds to core 0. Since I'm forcing >> the call to be executed from core 1 (by using taskset), >> an inter-processor interrupt is sent to core 0 for those functions >> to be executed. > > Ahh, I understood that now :) .. So we are setting cpuset.quiesce from CPU1 > which will do a IPI to get migrate_timers called on CPU0.. I was setting > quiesce > from CPU0 only in my tests :) > > But how does this work fine on x86 then? There we should have exactly same > problem, isn't it? Yeah, I'm not sure why it is working on 3.15 x86_64 but not in 3.10 x86_64. Perhaps it's related to this patch: https://lkml.org/lkml/2013/9/19/349 >> Ok, thank you! I see that you have already fixed the problem. I tested >> your tree on ARM and now it seems to work correctly. > > Yeah, I just pushed your changes as well at the time I wrote last mail :) Oh, I see! Why didn't you just apply the patch on top of your tree so that the information included in the git commit (e.g: my name and mail) remains? This part: cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). 
Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 16:43, Viresh Kumar wrote: > On 24 April 2014 12:55, Daniel Sangorrin > wrote: >> I tried your set of patches for isolating particular CPU cores from unpinned >> timers. On x86_64 they were working fine, however I found out that on ARM >> they would fail under the following test: > > I am happy that these drew attention from somebody Atleast :) Thanks to you for your hard work. >> # mount -t cpuset none /cpuset >> # cd /cpuset >> # mkdir rt >> # cd rt >> # echo 1 > cpus >> # echo 1 > cpu_exclusive >> # cd >> # taskset 0x2 ./setquiesce.sh <--- contains "echo 1 > /cpuset/rt/quiesce" >> [ 75.622375] [ cut here ] >> [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 >> __migrate_hrtimers+0x17c/0x1bc() >> [ 75.636840] DEBUG_LOCKS_WARN_ON(current->hardirq_context) >> [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted >> 3.14.0-rc1-37710-g23c8f02 #1 >> [ 75.649627] [] (unwind_backtrace) from [] >> (show_stack+0x10/0x14) >> [ 75.649627] [] (show_stack) from [] >> (dump_stack+0x78/0x94) >> [ 75.662689] [] (dump_stack) from [] >> (warn_slowpath_common+0x60/0x84) >> [ 75.670410] [] (warn_slowpath_common) from [] >> (warn_slowpath_fmt+0x30/0x40) >> [ 75.677673] [] (warn_slowpath_fmt) from [] >> (__migrate_hrtimers+0x17c/0x1bc) >> [ 75.677673] [] (__migrate_hrtimers) from [] >> (generic_smp_call_function_single_interrupt+0x8c/0x104) >> [ 75.699645] [] (generic_smp_call_function_single_interrupt) >> from [] (handle_IPI+0xa4/0x16c) >> [ 75.706970] [] (handle_IPI) from [] >> (gic_handle_irq+0x54/0x5c) >> [ 75.715087] [] (gic_handle_irq) from [] >> (__irq_svc+0x44/0x5c) >> [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) > > I couldn't understand why we went via a interrupt here ? Probably CPU1 > was idle and was woken up with a IPI and then this happened. But in > that case too, > shouldn't the script run from process context instead ? 
In kernel/cpuset.c:quiesce_cpuset() you are using the function 'smp_call_function_any' which asks CPU cores in 'cpumask' to execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. In the case above, 'cpumask' corresponds to core 0. Since I'm forcing the call to be executed from core 1 (by using taskset), an inter-processor interrupt is sent to core 0 for those functions to be executed. >> I also backported your patches to Linux 3.10.y and found the same problem >> both in ARM and x86_64. > > There are very few changes in between 3.10 and latest for timers/hrtimers > and so things are expected to be the same. > >> However, I think I figured out the reason for those >> errors. Please, could you check the patch below (it applies on the top of >> your tree, branch isolate-cpusets) and let me know what you think? > > Okay, just to let you know, I have also found some issues and they are > now pushed in my tree.. Also it is rebased over 3.15-rc2 now. Ok, thank you! I see that you have already fixed the problem. I tested your tree on ARM and now it seems to work correctly. > >> -PATCH STARTS HERE- >> cpuset: quiesce: change irq disable/enable by irq save/restore >> >> The function __migrate_timers can be called under interrupt context >> or thread context depending on the core where the system call was >> executed. In case it executes under interrupt context, it > > How exactly? See my reply above. >> seems a bad idea to leave interrupts enabled after migrating the >> timers. In fact, this caused kernel errors on the ARM architecture and >> on the x86_64 architecture with the 3.10 kernel (backported version >> of the cpuset-quiesce patch). > > I can't keep it as a separate patch and so would be required to merge > it into my original patch.. 
> > Thanks for your inputs :) > -- > To unsubscribe from this list: send the line "unsubscribe cgroups" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timer_stats: add core information to event counters
Hi, This patch adds information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. I tested the patch on a dual-core x86_64 and another dual-core ARM board. This patch may be useful for debugging timer execution in multi-core environments. Daniel Sangorrin (1): timer_stats: add core information to event counters Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) -- 1.8.1.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi --- Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..e54077a 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,23 +45,28 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which initialized the timer and in parenthesis the callback function which was executed on expiry. 
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr->count = 0; + memset(curr->count, 0, sizeof(curr->count)); curr->next = NULL; memcpy(curr->comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(, comm); if (likely(entry)) - entry->count++; + entry->count[raw_smp_processor_id()]++; else atomic_inc(_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0; ktime_t time; - int i; + int i, cpu; mutex_lock(_mutex); /* @@ -307,19 +307,21 @@ static int tstats
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
Dear Viresh Kumar, I tried your set of patches for isolating particular CPU cores from unpinned timers. On x86_64 they were working fine, however I found out that on ARM they would fail under the following test: # mount -t cpuset none /cpuset # cd /cpuset # mkdir rt # cd rt # echo 1 > cpus # echo 1 > cpu_exclusive # cd # taskset 0x2 ./setquiesce.sh <--- contains "echo 1 > /cpuset/rt/quiesce" [ 75.622375] [ cut here ] [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 __migrate_hrtimers+0x17c/0x1bc() [ 75.636840] DEBUG_LOCKS_WARN_ON(current->hardirq_context) [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-37710-g23c8f02 #1 [ 75.649627] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 75.649627] [] (show_stack) from [] (dump_stack+0x78/0x94) [ 75.662689] [] (dump_stack) from [] (warn_slowpath_common+0x60/0x84) [ 75.670410] [] (warn_slowpath_common) from [] (warn_slowpath_fmt+0x30/0x40) [ 75.677673] [] (warn_slowpath_fmt) from [] (__migrate_hrtimers+0x17c/0x1bc) [ 75.677673] [] (__migrate_hrtimers) from [] (generic_smp_call_function_single_interrupt+0x8c/0x104) [ 75.699645] [] (generic_smp_call_function_single_interrupt) from [] (handle_IPI+0xa4/0x16c) [ 75.706970] [] (handle_IPI) from [] (gic_handle_irq+0x54/0x5c) [ 75.715087] [] (gic_handle_irq) from [] (__irq_svc+0x44/0x5c) [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) I also backported your patches to Linux 3.10.y and found the same problem both in ARM and x86_64. However, I think I figured out the reason for those errors. Please, could you check the patch below (it applies on the top of your tree, branch isolate-cpusets) and let me know what you think? Thanks, Daniel -PATCH STARTS HERE- cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. 
In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi --- kernel/hrtimer.c | 5 +++-- kernel/timer.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8cd1db..abb1707 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1701,8 +1701,9 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) struct hrtimer_clock_base *clock_base; unsigned int active_bases; int i; + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); old_base = _cpu(hrtimer_bases, scpu); new_base = &__get_cpu_var(hrtimer_bases); /* @@ -1722,7 +1723,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ diff --git a/kernel/timer.c b/kernel/timer.c index 4676a07..2715b7d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1644,6 +1644,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) struct tvec_base *old_base; struct tvec_base *new_base; int i; + unsigned long flags; old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); @@ -1651,7 +1652,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(_base->lock); + spin_lock_irqsave(_base->lock, flags); spin_lock_nested(_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1671,7 +1672,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) } spin_unlock(_base->lock); - spin_unlock_irq(_base->lock); + spin_unlock_irqrestore(_base->lock, flags); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ -- 1.8.1.1 -PATCH ENDS HERE- On 2014/03/20 22:48, Viresh Kumar wrote: > For networking platforms we need to provide one isolated CPU per user space > data > plane thread. These CPUs should not be interrupted by kernel at all unless > userspace has requested for some syscalls. And so must live in isolated mode. > Currently, there are background kernel activities that are running on almost > every CPU, like: timers/hrtimers/watchdogs/etc, and these a
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
Dear Viresh Kumar, I tried your set of patches for isolating particular CPU cores from unpinned timers. On x86_64 they were working fine, however I found out that on ARM they would fail under the following test: # mount -t cpuset none /cpuset # cd /cpuset # mkdir rt # cd rt # echo 1 cpus # echo 1 cpu_exclusive # cd # taskset 0x2 ./setquiesce.sh --- contains echo 1 /cpuset/rt/quiesce [ 75.622375] [ cut here ] [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 __migrate_hrtimers+0x17c/0x1bc() [ 75.636840] DEBUG_LOCKS_WARN_ON(current-hardirq_context) [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-37710-g23c8f02 #1 [ 75.649627] [c0014d18] (unwind_backtrace) from [c00119e8] (show_stack+0x10/0x14) [ 75.649627] [c00119e8] (show_stack) from [c065b61c] (dump_stack+0x78/0x94) [ 75.662689] [c065b61c] (dump_stack) from [c003e9a4] (warn_slowpath_common+0x60/0x84) [ 75.670410] [c003e9a4] (warn_slowpath_common) from [c003ea24] (warn_slowpath_fmt+0x30/0x40) [ 75.677673] [c003ea24] (warn_slowpath_fmt) from [c005d7b0] (__migrate_hrtimers+0x17c/0x1bc) [ 75.677673] [c005d7b0] (__migrate_hrtimers) from [c009e004] (generic_smp_call_function_single_interrupt+0x8c/0x104) [ 75.699645] [c009e004] (generic_smp_call_function_single_interrupt) from [c00134d0] (handle_IPI+0xa4/0x16c) [ 75.706970] [c00134d0] (handle_IPI) from [c0008614] (gic_handle_irq+0x54/0x5c) [ 75.715087] [c0008614] (gic_handle_irq) from [c0012624] (__irq_svc+0x44/0x5c) [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) I also backported your patches to Linux 3.10.y and found the same problem both in ARM and x86_64. However, I think I figured out the reason for those errors. Please, could you check the patch below (it applies on the top of your tree, branch isolate-cpusets) and let me know what you think? 
Thanks, Daniel -PATCH STARTS HERE- cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- kernel/hrtimer.c | 5 +++-- kernel/timer.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8cd1db..abb1707 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1701,8 +1701,9 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) struct hrtimer_clock_base *clock_base; unsigned int active_bases; int i; + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); old_base = per_cpu(hrtimer_bases, scpu); new_base = __get_cpu_var(hrtimer_bases); /* @@ -1722,7 +1723,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ diff --git a/kernel/timer.c b/kernel/timer.c index 4676a07..2715b7d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1644,6 +1644,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) struct tvec_base *old_base; struct tvec_base *new_base; int i; + unsigned long flags; old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); @@ -1651,7 +1652,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(new_base-lock); + spin_lock_irqsave(new_base-lock, flags); spin_lock_nested(old_base-lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base-running_timer); @@ -1671,7 +1672,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) } spin_unlock(old_base-lock); - spin_unlock_irq(new_base-lock); + spin_unlock_irqrestore(new_base-lock, flags); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ -- 1.8.1.1 -PATCH ENDS HERE- On 2014/03/20 22:48, Viresh Kumar wrote: For networking platforms we need to provide one isolated CPU per user space data plane thread. These CPUs should not be interrupted by kernel at all unless userspace has requested for some syscalls. And so must live in isolated mode
[PATCH] timer_stats: add core information to event counters
Hi, This patch adds information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. I tested the patch on a dual-core x86_64 and another dual-core ARM board. This patch may be useful for debugging timer execution in multi-core environments. Daniel Sangorrin (1): timer_stats: add core information to event counters Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) -- 1.8.1.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..e54077a 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,23 +45,28 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which initialized the timer and in parenthesis the callback function which was executed on expiry. 
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr-count = 0; + memset(curr-count, 0, sizeof(curr-count)); curr-next = NULL; memcpy(curr-comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(input, comm); if (likely(entry)) - entry-count++; + entry-count[raw_smp_processor_id()]++; else atomic_inc(overflow_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0; ktime_t time; - int i; + int i, cpu; mutex_lock(show_mutex
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 16:43, Viresh Kumar wrote: On 24 April 2014 12:55, Daniel Sangorrin daniel.sangor...@toshiba.co.jp wrote: I tried your set of patches for isolating particular CPU cores from unpinned timers. On x86_64 they were working fine, however I found out that on ARM they would fail under the following test: I am happy that these drew attention from somebody Atleast :) Thanks to you for your hard work. # mount -t cpuset none /cpuset # cd /cpuset # mkdir rt # cd rt # echo 1 cpus # echo 1 cpu_exclusive # cd # taskset 0x2 ./setquiesce.sh --- contains echo 1 /cpuset/rt/quiesce [ 75.622375] [ cut here ] [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 __migrate_hrtimers+0x17c/0x1bc() [ 75.636840] DEBUG_LOCKS_WARN_ON(current-hardirq_context) [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-37710-g23c8f02 #1 [ 75.649627] [c0014d18] (unwind_backtrace) from [c00119e8] (show_stack+0x10/0x14) [ 75.649627] [c00119e8] (show_stack) from [c065b61c] (dump_stack+0x78/0x94) [ 75.662689] [c065b61c] (dump_stack) from [c003e9a4] (warn_slowpath_common+0x60/0x84) [ 75.670410] [c003e9a4] (warn_slowpath_common) from [c003ea24] (warn_slowpath_fmt+0x30/0x40) [ 75.677673] [c003ea24] (warn_slowpath_fmt) from [c005d7b0] (__migrate_hrtimers+0x17c/0x1bc) [ 75.677673] [c005d7b0] (__migrate_hrtimers) from [c009e004] (generic_smp_call_function_single_interrupt+0x8c/0x104) [ 75.699645] [c009e004] (generic_smp_call_function_single_interrupt) from [c00134d0] (handle_IPI+0xa4/0x16c) [ 75.706970] [c00134d0] (handle_IPI) from [c0008614] (gic_handle_irq+0x54/0x5c) [ 75.715087] [c0008614] (gic_handle_irq) from [c0012624] (__irq_svc+0x44/0x5c) [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) I couldn't understand why we went via a interrupt here ? Probably CPU1 was idle and was woken up with a IPI and then this happened. But in that case too, shouldn't the script run from process context instead ? 
In kernel/cpuset.c:quiesce_cpuset() you are using the function 'smp_call_function_any' which asks CPU cores in 'cpumask' to execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. In the case above, 'cpumask' corresponds to core 0. Since I'm forcing the call to be executed from core 1 (by using taskset), an inter-processor interrupt is sent to core 0 for those functions to be executed. I also backported your patches to Linux 3.10.y and found the same problem both in ARM and x86_64. There are very few changes in between 3.10 and latest for timers/hrtimers and so things are expected to be the same. However, I think I figured out the reason for those errors. Please, could you check the patch below (it applies on the top of your tree, branch isolate-cpusets) and let me know what you think? Okay, just to let you know, I have also found some issues and they are now pushed in my tree.. Also it is rebased over 3.15-rc2 now. Ok, thank you! I see that you have already fixed the problem. I tested your tree on ARM and now it seems to work correctly. -PATCH STARTS HERE- cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it How exactly? See my reply above. seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). I can't keep it as a separate patch and so would be required to merge it into my original patch.. 
Thanks for your inputs :) -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 17:41, Viresh Kumar wrote: On 24 April 2014 14:01, Daniel Sangorrin daniel.sangor...@toshiba.co.jp wrote: In kernel/cpuset.c:quiesce_cpuset() you are using the function 'smp_call_function_any' which asks CPU cores in 'cpumask' to execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. In the case above, 'cpumask' corresponds to core 0. Since I'm forcing the call to be executed from core 1 (by using taskset), an inter-processor interrupt is sent to core 0 for those functions to be executed. Ahh, I understood that now :) .. So we are setting cpuset.quiesce from CPU1 which will do a IPI to get migrate_timers called on CPU0.. I was setting quiesce from CPU0 only in my tests :) But how does this work fine on x86 then? There we should have exactly same problem, isn't it? Yeah, I'm not sure why it is working on 3.15 x86_64 but not in 3.10 x86_64. Perhaps it's related to this patch: https://lkml.org/lkml/2013/9/19/349 Ok, thank you! I see that you have already fixed the problem. I tested your tree on ARM and now it seems to work correctly. Yeah, I just pushed your changes as well at the time I wrote last mail :) Oh, I see! Why didn't you just apply the patch on top of your tree so that the information included in the git commit (e.g: my name and mail) remains? This part: cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). 
Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] timer_stats: add core information to event counters
On 2014/04/25 2:54, Randy Dunlap wrote: On 04/24/14 00:47, Daniel Sangorrin wrote: Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..e54077a 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,23 +45,28 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + Is the number of events in decimal or hex? If in decimal, what do 0D and 3D mean? It is in decimal notation. The 'D' means 'deferrable timer' and it is already described in timer_stats.txt. +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which initialized the timer and in parenthesis the callback function which was parentheses Well spotted. I'll fix it and send it again. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
I can't keep it as a separate patch and so would be required to merge it into my original patch.. And the reason being: No patch is supposed to break things, otherwise git bisect wouldn't work smoothly.. And so git bisect would complain this issue after my patch and so I have to merge the fixes you gave into the original patch as its not yet merged. -- To unsubscribe from this list: send the line unsubscribe cgroups in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Ok, no problem then. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- Documentation/timers/timer_stats.txt | 43 kernel/time/timer_stats.c| 20 + 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..aeb467f 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,24 +45,29 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which -initialized the timer and in parenthesis the callback function which was +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which +initialized the timer and in parentheses the callback function which was executed on expiry. 
Thomas, Ingo diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr-count = 0; + memset(curr-count, 0, sizeof(curr-count)); curr-next = NULL; memcpy(curr-comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(input, comm); if (likely(entry)) - entry-count++; + entry-count[raw_smp_processor_id()]++; else atomic_inc(overflow_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/25 13:51, Viresh Kumar wrote: On 25 April 2014 06:01, Daniel Sangorrin daniel.sangor...@toshiba.co.jp wrote: I can't keep it as a separate patch and so would be required to merge it into my original patch.. And the reason being: No patch is supposed to break things, otherwise git bisect wouldn't work smoothly.. And so git bisect would complain this issue after my patch and so I have to merge the fixes you gave into the original patch as its not yet merged. -- To unsubscribe from this list: send the line unsubscribe cgroups in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Ok, no problem then. Do you want me to add your Tested-by for my next version ? Sure. Tested-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/