[PATCH v4 4.4 3/4] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream. With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid of it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2ff03cc..16825dd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -575,15 +575,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. -* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.7.4
[PATCH v4 4.4 3/4] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream. With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2ff03cc..16825dd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -575,15 +575,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. 
-* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.7.4
[PATCH v4 4.4 1/4] KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
From: Paolo Bonzini commit 5a5fbdc0e3f1159a734f1890da60fce70e98271d upstream. It is now equal to use_eager_fpu(), which simply tests a cpufeature bit. Signed-off-by: Paolo Bonzini Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c| 3 +-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74fda1a..3a37cdb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -439,7 +439,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; - bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 83d6369..7b4ea5e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu(); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53d43d2..660d8a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7325,7 +7325,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } -- 2.7.4
[PATCH v4 4.4 1/4] KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
From: Paolo Bonzini commit 5a5fbdc0e3f1159a734f1890da60fce70e98271d upstream. It is now equal to use_eager_fpu(), which simply tests a cpufeature bit. Signed-off-by: Paolo Bonzini Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c| 3 +-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74fda1a..3a37cdb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -439,7 +439,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; - bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 83d6369..7b4ea5e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu(); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53d43d2..660d8a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7325,7 +7325,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } -- 2.7.4
[PATCH v3 4.4.y 2/3] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid of it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7796e04..c8ba9b8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -581,15 +581,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. -* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.1.4
[PATCH v3 4.4.y 2/3] x86/fpu: Remove struct fpu::counter
From: Rik van Riel commit 3913cc3507575273beb165a5e027a081913ed507 upstream With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/kernel/fpu/core.c | 3 --- 3 files changed, 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7796e04..c8ba9b8 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -581,15 +581,12 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); prefetch(_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac..dcb85a9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,17 +303,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. 
-* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b282364..b322325 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -250,7 +250,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -353,7 +352,6 @@ void fpu__restore(struct fpu *fpu) kernel_fpu_disable(); fpregs_activate(fpu); copy_kernel_to_fpregs(>state); - fpu->counter++; kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -370,7 +368,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ -- 2.1.4
[PATCH v3 4.4.y 1/3] x86/fpu: Remove use_eager_fpu()
From: Andy Lutomirski commit c592b57347069abfc0dcad3b3a302cf882602597 upstream This removes all the obvious code paths that depend on lazy FPU mode. It shouldn't change the generated code at all. Signed-off-by: Andy Lutomirski Signed-off-by: Rik van Riel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-5-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/crypto/crc32c-intel_glue.c | 17 - arch/x86/include/asm/fpu/internal.h | 34 + arch/x86/kernel/fpu/core.c | 38 + arch/x86/kernel/fpu/signal.c| 8 +++- arch/x86/kvm/cpuid.c| 4 +--- arch/x86/kvm/x86.c | 10 -- 6 files changed, 14 insertions(+), 97 deletions(-) diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 15f5c76..d610c11 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,21 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. 
*/ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -185,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -197,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -256,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 146d838..7796e04 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -56,11 +56,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return true; -} - static __always_inline 
__pure bool use_xsaveopt(void) { return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); @@ -504,24 +499,6 @@ static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) } -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) -{ - if (!use_eager_fpu()) - clts(); -} - -static inline void __fpregs_deactivate_hw(void) -{ - if (!use_eager_fpu()) - stts(); -} - -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -530,7 +507,6 @@ static inline void __fpregs_deactivate(struct fpu *fpu) this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -/* Mu
[PATCH v3 4.4.y 1/3] x86/fpu: Remove use_eager_fpu()
From: Andy Lutomirski commit c592b57347069abfc0dcad3b3a302cf882602597 upstream This removes all the obvious code paths that depend on lazy FPU mode. It shouldn't change the generated code at all. Signed-off-by: Andy Lutomirski Signed-off-by: Rik van Riel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-5-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Sangorrin --- arch/x86/crypto/crc32c-intel_glue.c | 17 - arch/x86/include/asm/fpu/internal.h | 34 + arch/x86/kernel/fpu/core.c | 38 + arch/x86/kernel/fpu/signal.c| 8 +++- arch/x86/kvm/cpuid.c| 4 +--- arch/x86/kvm/x86.c | 10 -- 6 files changed, 14 insertions(+), 97 deletions(-) diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 15f5c76..d610c11 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,21 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. 
*/ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -185,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -197,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -256,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 146d838..7796e04 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -56,11 +56,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return true; -} - static __always_inline 
__pure bool use_xsaveopt(void) { return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); @@ -504,24 +499,6 @@ static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) } -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) -{ - if (!use_eager_fpu()) - clts(); -} - -static inline void __fpregs_deactivate_hw(void) -{ - if (!use_eager_fpu()) - stts(); -} - -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -530,7 +507,6 @@ static inline void __fpregs_deactivate(struct fpu *fpu) this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -/* Mu
RE: [PATCH 4.4 00/24] 4.4.138-stable review
> -Original Message- > From: stable-ow...@vger.kernel.org On Behalf > Of Ben Hutchings [..] > 3.18 and 4.4 are still missing this important fix to early parameter > parsing: > > commit 02afeaae9843733a39cd9b11053748b2d1dc5ae7 > Author: Dave Hansen > Date: Tue Dec 22 14:52:38 2015 -0800 > > x86/boot: Fix early command-line parsing when matching at end I have cherry-picked that commit into both 3.18.y and 4.4.y (it applies cleanly) and tested them on my machine. Both worked correctly. Test method: - Added printks to the functions that detect noxsave, noxsaves and noxsaveopt - Booted both kernels with and without the commit, and adding the kernel parameter "noxsave" - Checked that "noxsaves" and "noxsaveopt" do not appear on dmesg anymore after the commit. Thanks, Daniel
RE: [PATCH 4.4 00/24] 4.4.138-stable review
> -Original Message- > From: stable-ow...@vger.kernel.org On Behalf > Of Ben Hutchings [..] > 3.18 and 4.4 are still missing this important fix to early parameter > parsing: > > commit 02afeaae9843733a39cd9b11053748b2d1dc5ae7 > Author: Dave Hansen > Date: Tue Dec 22 14:52:38 2015 -0800 > > x86/boot: Fix early command-line parsing when matching at end I have cherry-picked that commit into both 3.18.y and 4.4.y (it applies cleanly) and tested them on my machine. Both worked correctly. Test method: - Added printks to the functions that detect noxsave, noxsaves and noxsaveopt - Booted both kernels with and without the commit, and adding the kernel parameter "noxsave" - Checked that "noxsaves" and "noxsaveopt" do not appear on dmesg anymore after the commit. Thanks, Daniel
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > Sent: Friday, June 15, 2018 4:06 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 02:23:08PM +0900, Daniel Sangorrin wrote: > > > -Original Message- > > > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > > > Behalf Of 'Greg Kroah-Hartman' > > > Sent: Friday, June 15, 2018 1:56 PM > > > To: Daniel Sangorrin > > > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy > > > Lutomirski' > > > ; 'Rik van Riel' ; 'Borislav Petkov' > > > ; 'Brian Gerst' ; 'Dave Hansen' > > > ; 'Denys Vlasenko' ; > > > 'Fenghua Yu' ; 'H. 
Peter Anvin' ; > 'Josh > > > Poimboeuf' ; 'Linus Torvalds' > > > ; 'Oleg Nesterov' ; 'Peter > > > Zijlstra' ; 'Quentin Casasnovas' > > > ; 'Thomas Gleixner' ; > > > pbonz...@redhat.com; 'Ingo Molnar' > > > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > > > > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > > > Hi Greg, > > > > > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 > > > > > */ > > > > > --- a/arch/x86/include/asm/fpu/internal.h > > > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > > > */ > > > > > static __always_inline __pure bool use_eager_fpu(void) > > > > > { > > > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > > > + return true; > > > > > } > > > > > > > > Since this function returns always true then we can remove the code > depending on > > > lazy FPU mode. > > > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > > > > > static void __init fpu__init_parse_early_param(void) > > > > > { > > > > > - if (cmdline_find_option_bool(boot_command_line, > "eagerfpu=off")) { > > > > > - eagerfpu = DISABLE; > > > > > - fpu__clear_eager_fpu_features(); > > > > > - } > > > > > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > should > > > remove it from the Documentation. > > > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > > > > > I will try backporting those patches unless anyone has any objections. > > > > > > What are the git commit ids of those patches in Linus's tree? No need > > > to point to patchwork links, I don't use that tool. > > > > OK, I got it. 
> > > > "x86/fpu: Remove use_eager_fpu()": > c592b57347069abfc0dcad3b3a302cf882602597 > > "x86/fpu: Finish excising 'eagerfpu'": > e63650840e8b053aa09ad934877e87e9941ed135 > > Minor nit. For kernel commits, the "normal" way we reference them looks > like this: > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > e63650840e8b ("x86/fpu: Finish excising 'eagerfpu'") > > Which makes it much easier to read and understand. That's what we use > for the "Fixes:" tag in commits and in other places (text in commit > messages.) > > To automatically generate that format, you can just do: > git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n" > > I recommend just setting up an alias for the above line, otherwise it's > a pain to have to remember how to do it all the time. Here's what I do: > $ alias gsr='git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n"' > $ gsr c592b57347069abfc0dcad3b3a302cf882602597 > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > Thanks a lot for the detailed explanation. > > Unfortunately, they don't apply cleanly to stable kernels. > > Should be very simple to backport if you want to. Also I need copies > for the 4.9.y tree as well if you do so. OK, I will do it after the OSS Japan next week if you don't mind. I also want to run some tests for the FPU and try different boot parameter combinations (e.g. no387, nofxsr etc) to check I didn't break anything. Thanks, Daniel > > thanks, > > greg k-h
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > Sent: Friday, June 15, 2018 4:06 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 02:23:08PM +0900, Daniel Sangorrin wrote: > > > -Original Message- > > > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > > > Behalf Of 'Greg Kroah-Hartman' > > > Sent: Friday, June 15, 2018 1:56 PM > > > To: Daniel Sangorrin > > > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy > > > Lutomirski' > > > ; 'Rik van Riel' ; 'Borislav Petkov' > > > ; 'Brian Gerst' ; 'Dave Hansen' > > > ; 'Denys Vlasenko' ; > > > 'Fenghua Yu' ; 'H. 
Peter Anvin' ; > 'Josh > > > Poimboeuf' ; 'Linus Torvalds' > > > ; 'Oleg Nesterov' ; 'Peter > > > Zijlstra' ; 'Quentin Casasnovas' > > > ; 'Thomas Gleixner' ; > > > pbonz...@redhat.com; 'Ingo Molnar' > > > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > > > > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > > > Hi Greg, > > > > > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 > > > > > */ > > > > > --- a/arch/x86/include/asm/fpu/internal.h > > > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > > > */ > > > > > static __always_inline __pure bool use_eager_fpu(void) > > > > > { > > > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > > > + return true; > > > > > } > > > > > > > > Since this function returns always true then we can remove the code > depending on > > > lazy FPU mode. > > > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > > > > > static void __init fpu__init_parse_early_param(void) > > > > > { > > > > > - if (cmdline_find_option_bool(boot_command_line, > "eagerfpu=off")) { > > > > > - eagerfpu = DISABLE; > > > > > - fpu__clear_eager_fpu_features(); > > > > > - } > > > > > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > should > > > remove it from the Documentation. > > > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > > > > > I will try backporting those patches unless anyone has any objections. > > > > > > What are the git commit ids of those patches in Linus's tree? No need > > > to point to patchwork links, I don't use that tool. > > > > OK, I got it. 
> > > > "x86/fpu: Remove use_eager_fpu()": > c592b57347069abfc0dcad3b3a302cf882602597 > > "x86/fpu: Finish excising 'eagerfpu'": > e63650840e8b053aa09ad934877e87e9941ed135 > > Minor nit. For kernel commits, the "normal" way we reference them looks > like this: > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > e63650840e8b ("x86/fpu: Finish excising 'eagerfpu'") > > Which makes it much easier to read and understand. That's what we use > for the "Fixes:" tag in commits and in other places (text in commit > messages.) > > To automatically generate that format, you can just do: > git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n" > > I recommend just setting up an alias for the above line, otherwise it's > a pain to have to remember how to do it all the time. Here's what I do: > $ alias gsr='git show -s --abbrev-commit --abbrev=12 --pretty=format:"%h > (\"%s\")%n"' > $ gsr c592b57347069abfc0dcad3b3a302cf882602597 > c592b5734706 ("x86/fpu: Remove use_eager_fpu()") > Thanks a lot for the detailed explanation. > > Unfortunately, they don't apply cleanly to stable kernels. > > Should be very simple to backport if you want to. Also I need copies > for the 4.9.y tree as well if you do so. OK, I will do it after the OSS Japan next week if you don't mind. I also want to run some tests for the FPU and try different boot parameter combinations (e.g. no387, nofxsr etc) to check I didn't break anything. Thanks, Daniel > > thanks, > > greg k-h
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of 'Greg Kroah-Hartman' > Sent: Friday, June 15, 2018 1:56 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > Hi Greg, > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > > > --- a/arch/x86/include/asm/fpu/internal.h > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > */ > > > static __always_inline __pure bool use_eager_fpu(void) > > > { > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > + return true; > > > } > > > > Since this function returns always true then we can remove the code > > depending on > lazy FPU mode. > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > static void __init fpu__init_parse_early_param(void) > > > { > > > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > > > - eagerfpu = DISABLE; > > > - fpu__clear_eager_fpu_features(); > > > - } > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > > should > remove it from the Documentation. > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > I will try backporting those patches unless anyone has any objections. > > What are the git commit ids of those patches in Linus's tree? 
No need > to point to patchwork links, I don't use that tool. OK, I got it. "x86/fpu: Remove use_eager_fpu()": c592b57347069abfc0dcad3b3a302cf882602597 "x86/fpu: Finish excising 'eagerfpu'": e63650840e8b053aa09ad934877e87e9941ed135 Unfortunately, they don't apply cleanly to stable kernels. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of 'Greg Kroah-Hartman' > Sent: Friday, June 15, 2018 1:56 PM > To: Daniel Sangorrin > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; 'Andy Lutomirski' > ; 'Rik van Riel' ; 'Borislav Petkov' > ; 'Brian Gerst' ; 'Dave Hansen' > ; 'Denys Vlasenko' ; > 'Fenghua Yu' ; 'H. Peter Anvin' ; 'Josh > Poimboeuf' ; 'Linus Torvalds' > ; 'Oleg Nesterov' ; 'Peter > Zijlstra' ; 'Quentin Casasnovas' > ; 'Thomas Gleixner' ; > pbonz...@redhat.com; 'Ingo Molnar' > Subject: Re: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode > > On Fri, Jun 15, 2018 at 01:24:27PM +0900, Daniel Sangorrin wrote: > > Hi Greg, > > > > > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > > > --- a/arch/x86/include/asm/fpu/internal.h > > > +++ b/arch/x86/include/asm/fpu/internal.h > > > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > > > */ > > > static __always_inline __pure bool use_eager_fpu(void) > > > { > > > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > > > + return true; > > > } > > > > Since this function returns always true then we can remove the code > > depending on > lazy FPU mode. > > Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" > > Ref: https://patchwork.kernel.org/patch/9365883/ > > > > > static void __init fpu__init_parse_early_param(void) > > > { > > > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > > > - eagerfpu = DISABLE; > > > - fpu__clear_eager_fpu_features(); > > > - } > > > > Since this patch removes the kernel boot parameter "eagerfpu", maybe we > > should > remove it from the Documentation. > > This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" > > Ref: https://patchwork.kernel.org/patch/9380673/ > > > > I will try backporting those patches unless anyone has any objections. > > What are the git commit ids of those patches in Linus's tree? 
No need > to point to patchwork links, I don't use that tool. OK, I got it. "x86/fpu: Remove use_eager_fpu()": c592b57347069abfc0dcad3b3a302cf882602597 "x86/fpu: Finish excising 'eagerfpu'": e63650840e8b053aa09ad934877e87e9941ed135 Unfortunately, they don't apply cleanly to stable kernels. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
Hi Greg, > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > --- a/arch/x86/include/asm/fpu/internal.h > +++ b/arch/x86/include/asm/fpu/internal.h > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > */ > static __always_inline __pure bool use_eager_fpu(void) > { > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > + return true; > } Since this function returns always true then we can remove the code depending on lazy FPU mode. Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" Ref: https://patchwork.kernel.org/patch/9365883/ > static void __init fpu__init_parse_early_param(void) > { > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > - eagerfpu = DISABLE; > - fpu__clear_eager_fpu_features(); > - } Since this patch removes the kernel boot parameter "eagerfpu", maybe we should remove it from the Documentation. This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" Ref: https://patchwork.kernel.org/patch/9380673/ I will try backporting those patches unless anyone has any objections. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 10/24] x86/fpu: Hard-disable lazy FPU mode
Hi Greg, > /* Intel-defined CPU features, CPUID level 0x0001 (ecx), word 4 */ > --- a/arch/x86/include/asm/fpu/internal.h > +++ b/arch/x86/include/asm/fpu/internal.h > @@ -58,7 +58,7 @@ extern u64 fpu__get_supported_xfeatures_ > */ > static __always_inline __pure bool use_eager_fpu(void) > { > - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); > + return true; > } Since this function returns always true then we can remove the code depending on lazy FPU mode. Actually this has already been done in "x86/fpu: Remove use_eager_fpu()" Ref: https://patchwork.kernel.org/patch/9365883/ > static void __init fpu__init_parse_early_param(void) > { > - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { > - eagerfpu = DISABLE; > - fpu__clear_eager_fpu_features(); > - } Since this patch removes the kernel boot parameter "eagerfpu", maybe we should remove it from the Documentation. This has also been done by commit "x86/fpu: Finish excising 'eagerfpu'" Ref: https://patchwork.kernel.org/patch/9380673/ I will try backporting those patches unless anyone has any objections. Thanks, Daniel Sangorrin
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: Naresh Kamboju [mailto:naresh.kamb...@linaro.org] > Sent: Friday, June 1, 2018 12:55 AM > To: Daniel Sangorrin > Cc: Greg Kroah-Hartman ; open list > ; linux- stable ; > Davidlohr Bueso ; Joe Lawrence ; > Andrea Arcangeli ; Manfred Spraul > ; Andrew Morton ; > Linus Torvalds > Subject: Re: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page > protection" > > On 31 May 2018 at 13:06, Daniel Sangorrin > wrote: > >> -Original Message- > >> From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > > .. > >> Thanks for letting us know, but this was reported already. See the > >> emails on lkml with the subject: > >> Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > >> from Davidlohr Bueso > >> Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > >> > >> where he discusses that the LTP test is incorrect and that the kernel > >> change is correct and that LTP is going to be fixed because of this. > > My two cents, > If you are referring to cve-2017-5669.c > LTP test case is been fixed few hours ago by Rafael Tinoco, > > - shm_addr = shmat(shm_id, ((void *)1), SHM_RND); > + shm_addr = shmat(shm_id, ((void *)1), SHM_RND | SHM_REMAP); > > LTP patch pull request and it is been merged. > https://github.com/linux-test-project/ltp/pull/324 Thanks a lot Naresh. I confirmed that the latest LTP cve-2017-5669 now PASSes. Thanks, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: Naresh Kamboju [mailto:naresh.kamb...@linaro.org] > Sent: Friday, June 1, 2018 12:55 AM > To: Daniel Sangorrin > Cc: Greg Kroah-Hartman ; open list > ; linux- stable ; > Davidlohr Bueso ; Joe Lawrence ; > Andrea Arcangeli ; Manfred Spraul > ; Andrew Morton ; > Linus Torvalds > Subject: Re: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page > protection" > > On 31 May 2018 at 13:06, Daniel Sangorrin > wrote: > >> -Original Message- > >> From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] > > .. > >> Thanks for letting us know, but this was reported already. See the > >> emails on lkml with the subject: > >> Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > >> from Davidlohr Bueso > >> Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > >> > >> where he discusses that the LTP test is incorrect and that the kernel > >> change is correct and that LTP is going to be fixed because of this. > > My two cents, > If you are referring to cve-2017-5669.c > LTP test case is been fixed few hours ago by Rafael Tinoco, > > - shm_addr = shmat(shm_id, ((void *)1), SHM_RND); > + shm_addr = shmat(shm_id, ((void *)1), SHM_RND | SHM_REMAP); > > LTP patch pull request and it is been merged. > https://github.com/linux-test-project/ltp/pull/324 Thanks a lot Naresh. I confirmed that the latest LTP cve-2017-5669 now PASSes. Thanks, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] .. > Thanks for letting us know, but this was reported already. See the > emails on lkml with the subject: > Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > from Davidlohr Bueso > Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > > where he discusses that the LTP test is incorrect and that the kernel > change is correct and that LTP is going to be fixed because of this. OK, thank you for the information! Regards, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: 'Greg Kroah-Hartman' [mailto:gre...@linuxfoundation.org] .. > Thanks for letting us know, but this was reported already. See the > emails on lkml with the subject: > Subject: Re: [PATCH 4.16 000/272] 4.16.13-stable review > from Davidlohr Bueso > Message-ID: <20180528213039.yy2madue67njkmw5@linux-n805> > > where he discusses that the LTP test is incorrect and that the kernel > change is correct and that LTP is going to be fixed because of this. OK, thank you for the information! Regards, Daniel
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > 4.4-stable review patch. If anyone has any objections, please let me know. > > -- > > From: Davidlohr Bueso > > commit a73ab244f0dad8fffb3291b905f73e2d3eaa7c00 upstream. > > Patch series "ipc/shm: shmat() fixes around nil-page". Sorry for being a bit late (the pace is really fast here). I have found a regression from 4.4.133-rc1 to 4.4.134-rc1 using Fuego LTP wrapper. 4.4.134-rc1 tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:74: INFO: Mapped shared memory to (nil) cve-2017-5669.c:78: FAIL: We have mapped a VM address within the first 64Kb cve-2017-5669.c:84: INFO: Touching shared memory to see if anything strange happens 4.4.133-rc1: tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:67: PASS: shmat returned EINVAL The culprits should be one or both of the two last commits to ipc/shm (one of them a revert). - ipc/shm: fix shmat() nil address after round-down when remapping - Revert "ipc/shm: Fix shmat mmap nil-page protection" I need to investigate the concrete reason, but for now I just wanted to report it. Thanks, Daniel > > These patches fix two issues reported[1] a while back by Joe and Andrea > around how shmat(2) behaves with nil-page. > > The first reverts a commit that it was incorrectly thought that mapping > nil-page (address=0) was a no no with MAP_FIXED. This is not the case, > with the exception of SHM_REMAP; which is address in the second patch. > > I chose two patches because it is easier to backport and it explicitly > reverts bogus behaviour. Both patches ought to be in -stable and ltp > testcases need updated (the added testcase around the cve can be > modified to just test for SHM_RND|SHM_REMAP). 
> > [1] lkml.kernel.org/r/20180430172152.nfa564pvgpk3ut7p@linux-n805 > > This patch (of 2): > > Commit 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > worked on the idea that we should not be mapping as root addr=0 and > MAP_FIXED. However, it was reported that this scenario is in fact > valid, thus making the patch both bogus and breaks userspace as well. > > For example X11's libint10.so relies on shmat(1, SHM_RND) for lowmem > initialization[1]. > > [1] > https://cgit.freedesktop.org/xorg/xserver/tree/hw/xfree86/os-support/linux/int1 > 0/linux.c#n347 > Link: http://lkml.kernel.org/r/20180503203243.15045-2-d...@stgolabs.net > Fixes: 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > Signed-off-by: Davidlohr Bueso > Reported-by: Joe Lawrence > Reported-by: Andrea Arcangeli > Cc: Manfred Spraul > Cc: > Signed-off-by: Andrew Morton > Signed-off-by: Linus Torvalds > Signed-off-by: Greg Kroah-Hartman > > --- > ipc/shm.c |9 ++--- > 1 file changed, 2 insertions(+), 7 deletions(-) > > --- a/ipc/shm.c > +++ b/ipc/shm.c > @@ -1113,13 +1113,8 @@ long do_shmat(int shmid, char __user *sh > goto out; > else if ((addr = (ulong)shmaddr)) { > if (addr & (shmlba - 1)) { > - /* > - * Round down to the nearest multiple of shmlba. > - * For sane do_mmap_pgoff() parameters, avoid > - * round downs that trigger nil-page and MAP_FIXED. > - */ > - if ((shmflg & SHM_RND) && addr >= shmlba) > - addr &= ~(shmlba - 1); > + if (shmflg & SHM_RND) > + addr &= ~(shmlba - 1); /* round down */ > else > #ifndef __ARCH_FORCE_SHMLBA > if (addr & ~PAGE_MASK) > >
RE: [PATCH 4.4 011/268] Revert "ipc/shm: Fix shmat mmap nil-page protection"
> -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > 4.4-stable review patch. If anyone has any objections, please let me know. > > -- > > From: Davidlohr Bueso > > commit a73ab244f0dad8fffb3291b905f73e2d3eaa7c00 upstream. > > Patch series "ipc/shm: shmat() fixes around nil-page". Sorry for being a bit late (the pace is really fast here). I have found a regression from 4.4.133-rc1 to 4.4.134-rc1 using Fuego LTP wrapper. 4.4.134-rc1 tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:74: INFO: Mapped shared memory to (nil) cve-2017-5669.c:78: FAIL: We have mapped a VM address within the first 64Kb cve-2017-5669.c:84: INFO: Touching shared memory to see if anything strange happens 4.4.133-rc1: tst_test.c:982: INFO: Timeout per run is 0h 05m 00s cve-2017-5669.c:62: INFO: Attempting to attach shared memory to null page cve-2017-5669.c:67: PASS: shmat returned EINVAL The culprits should be one or both of the two last commits to ipc/shm (one of them a revert). - ipc/shm: fix shmat() nil address after round-down when remapping - Revert "ipc/shm: Fix shmat mmap nil-page protection" I need to investigate the concrete reason, but for now I just wanted to report it. Thanks, Daniel > > These patches fix two issues reported[1] a while back by Joe and Andrea > around how shmat(2) behaves with nil-page. > > The first reverts a commit that it was incorrectly thought that mapping > nil-page (address=0) was a no no with MAP_FIXED. This is not the case, > with the exception of SHM_REMAP; which is address in the second patch. > > I chose two patches because it is easier to backport and it explicitly > reverts bogus behaviour. Both patches ought to be in -stable and ltp > testcases need updated (the added testcase around the cve can be > modified to just test for SHM_RND|SHM_REMAP). 
> > [1] lkml.kernel.org/r/20180430172152.nfa564pvgpk3ut7p@linux-n805 > > This patch (of 2): > > Commit 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > worked on the idea that we should not be mapping as root addr=0 and > MAP_FIXED. However, it was reported that this scenario is in fact > valid, thus making the patch both bogus and breaks userspace as well. > > For example X11's libint10.so relies on shmat(1, SHM_RND) for lowmem > initialization[1]. > > [1] > https://cgit.freedesktop.org/xorg/xserver/tree/hw/xfree86/os-support/linux/int1 > 0/linux.c#n347 > Link: http://lkml.kernel.org/r/20180503203243.15045-2-d...@stgolabs.net > Fixes: 95e91b831f87 ("ipc/shm: Fix shmat mmap nil-page protection") > Signed-off-by: Davidlohr Bueso > Reported-by: Joe Lawrence > Reported-by: Andrea Arcangeli > Cc: Manfred Spraul > Cc: > Signed-off-by: Andrew Morton > Signed-off-by: Linus Torvalds > Signed-off-by: Greg Kroah-Hartman > > --- > ipc/shm.c |9 ++--- > 1 file changed, 2 insertions(+), 7 deletions(-) > > --- a/ipc/shm.c > +++ b/ipc/shm.c > @@ -1113,13 +1113,8 @@ long do_shmat(int shmid, char __user *sh > goto out; > else if ((addr = (ulong)shmaddr)) { > if (addr & (shmlba - 1)) { > - /* > - * Round down to the nearest multiple of shmlba. > - * For sane do_mmap_pgoff() parameters, avoid > - * round downs that trigger nil-page and MAP_FIXED. > - */ > - if ((shmflg & SHM_RND) && addr >= shmlba) > - addr &= ~(shmlba - 1); > + if (shmflg & SHM_RND) > + addr &= ~(shmlba - 1); /* round down */ > else > #ifndef __ARCH_FORCE_SHMLBA > if (addr & ~PAGE_MASK) > >
RE: [PATCH 4.4 00/92] 4.4.133-stable review
> -Original Message- > From: Rafael Tinoco [mailto:rafael.tin...@linaro.org] > > Thank you Daniel! Will investigate those. OK, thank you :). Notice that I did those tests on x86_64. It seems you are testing on arm, so there may be some differences. I just checked these tests on 4.4.133 (on x86_64): fcntl35: PASS fcntl35_64: PASS select04: PASS I am currently investigating other tests that are failing as well. They are not regressions, just some patches have not been backported yet. Thanks, Daniel > > Meanwhile, Greg, I referred to: > > time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Since we're not using this type of clock on arm64's 4.4 kernel vdso > functions. This commit's description calls attention for it to be > responsible for fixing kselftest flacking tests, we wouldn't get that > on 4.4 according to bellow: > > stable-rc-linux-4.14.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.16.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.4.y > > > stable-rc-linux-4.9.y > 99f66b5182a4 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > Yet, the second fix was backported to all (including 4.4.y): > > stable-rc-linux-4.14.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.16.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW 
sub-nanosecond accounting > stable-rc-linux-4.4.y > 7c8bd6e07430 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.9.y > a53bfdda06ac time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Not sure you want to keep it in 4.4, thought it was worth mentioning it. > > Cheers. > > On 24 May 2018 at 22:34, Daniel Sangorrin > <daniel.sangor...@toshiba.co.jp> wrote: > > Hello Rafael, > > > > The tests fcntl35 and fcntl35_64 should have go from FAIL to PASS. > > https://www.spinics.net/lists/stable/msg239475.html > > > > Looking at > > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ > > I see that these two tests (and other important tests as well) are being > > SKIPPED. > > > > By the way, I see that select04 FAILS in your case. But in my setup, > > select04 was > working fine (x86_64) in 4.4.132. I will confirm that it still works in > 4.4.133 > > > > Thanks, > > Daniel Sangorrin > > > >> -Original Message- > >> From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > >> Behalf Of Rafael Tinoco > >> Sent: Friday, May 25, 2018 5:32 AM > >> To: Greg Kroah-Hartman <gre...@linuxfoundation.org> > >> Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > >> lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > >> sta...@vger.kernel.org; a...@linux-foundation.org; > >> torva...@linux-foundation.org; li...@roeck-us.net > >> Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > >> > >> > > kernel: 4.4.133-rc1 > >> > > git repo: > >> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > >> > > git branch: linux-4.4.y > >> > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > >> > > git describe: v4.4.132-93-g915a3d7cdea9 > >> > > Test details: > >> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > >> 3d7cdea9 > >> > > > >> > > > >> > > No 
regressions (compared to build v4.4.132-71-g180635995c36) > >> > > >> > It should have gotten better, as there was a fix in here for at least 2 > >> > LTP tests that we previously were not passing. I don't know why you all > >> > were not reporting that in the past, it took someone else randomly > >> > deciding to run LTP to report it to me... > >> > > >> > Why did an improvement in results not show up? > >> > >> Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? > >> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > >> (commit 49eea433b326 in mainline) so this "fix" is changing the > >> timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > >> ARM64. Needs review though :\ > > > > > >
RE: [PATCH 4.4 00/92] 4.4.133-stable review
> -Original Message- > From: Rafael Tinoco [mailto:rafael.tin...@linaro.org] > > Thank you Daniel! Will investigate those. OK, thank you :). Notice that I did those tests on x86_64. It seems you are testing on arm, so there may be some differences. I just checked these tests on 4.4.133 (on x86_64): fcntl35: PASS fcntl35_64: PASS select04: PASS I am currently investigating other tests that are failing as well. They are not regressions, just some patches have not been backported yet. Thanks, Daniel > > Meanwhile, Greg, I referred to: > > time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Since we're not using this type of clock on arm64's 4.4 kernel vdso > functions. This commit's description calls attention for it to be > responsible for fixing kselftest flacking tests, we wouldn't get that > on 4.4 according to bellow: > > stable-rc-linux-4.14.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.16.y > dbb236c1ceb6 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > stable-rc-linux-4.4.y > > > stable-rc-linux-4.9.y > 99f66b5182a4 arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW > 49eea433b326 arm64: Add support for CLOCK_MONOTONIC_RAW in > clock_gettime() vDSO > 82e88ff1ea94 hrtimer: Revert CLOCK_MONOTONIC_RAW support > 9c808765e88e hrtimer: Add support for CLOCK_MONOTONIC_RAW > > Yet, the second fix was backported to all (including 4.4.y): > > stable-rc-linux-4.14.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.16.y > 3d88d56c5873 time: Fix CLOCK_MONOTONIC_RAW 
sub-nanosecond accounting > stable-rc-linux-4.4.y > 7c8bd6e07430 time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > stable-rc-linux-4.9.y > a53bfdda06ac time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting > > Not sure you want to keep it in 4.4, thought it was worth mentioning it. > > Cheers. > > On 24 May 2018 at 22:34, Daniel Sangorrin > wrote: > > Hello Rafael, > > > > The tests fcntl35 and fcntl35_64 should have go from FAIL to PASS. > > https://www.spinics.net/lists/stable/msg239475.html > > > > Looking at > > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ > > I see that these two tests (and other important tests as well) are being > > SKIPPED. > > > > By the way, I see that select04 FAILS in your case. But in my setup, > > select04 was > working fine (x86_64) in 4.4.132. I will confirm that it still works in > 4.4.133 > > > > Thanks, > > Daniel Sangorrin > > > >> -Original Message- > >> From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] > On > >> Behalf Of Rafael Tinoco > >> Sent: Friday, May 25, 2018 5:32 AM > >> To: Greg Kroah-Hartman > >> Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > >> lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > >> sta...@vger.kernel.org; a...@linux-foundation.org; > >> torva...@linux-foundation.org; li...@roeck-us.net > >> Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > >> > >> > > kernel: 4.4.133-rc1 > >> > > git repo: > >> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > >> > > git branch: linux-4.4.y > >> > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > >> > > git describe: v4.4.132-93-g915a3d7cdea9 > >> > > Test details: > >> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > >> 3d7cdea9 > >> > > > >> > > > >> > > No regressions (compared to build v4.4.132-71-g180635995c36) > >> > 
> >> > It should have gotten better, as there was a fix in here for at least 2 > >> > LTP tests that we previously were not passing. I don't know why you all > >> > were not reporting that in the past, it took someone else randomly > >> > deciding to run LTP to report it to me... > >> > > >> > Why did an improvement in results not show up? > >> > >> Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? > >> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > >> (commit 49eea433b326 in mainline) so this "fix" is changing the > >> timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > >> ARM64. Needs review though :\ > > > > > >
RE: [PATCH 4.4 00/92] 4.4.133-stable review
Hello Rafael, The tests fcntl35 and fcntl35_64 should have gone from FAIL to PASS. https://www.spinics.net/lists/stable/msg239475.html Looking at https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ I see that these two tests (and other important tests as well) are being SKIPPED. By the way, I see that select04 FAILS in your case. But in my setup, select04 was working fine (x86_64) in 4.4.132. I will confirm that it still works in 4.4.133. Thanks, Daniel Sangorrin > -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of Rafael Tinoco > Sent: Friday, May 25, 2018 5:32 AM > To: Greg Kroah-Hartman <gre...@linuxfoundation.org> > Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > sta...@vger.kernel.org; a...@linux-foundation.org; > torva...@linux-foundation.org; li...@roeck-us.net > Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > > > > kernel: 4.4.133-rc1 > > > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > > git branch: linux-4.4.y > > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > > > git describe: v4.4.132-93-g915a3d7cdea9 > > > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9 > > > > > > > > > No regressions (compared to build v4.4.132-71-g180635995c36) > > > > It should have gotten better, as there was a fix in here for at least 2 > > LTP tests that we previously were not passing. I don't know why you all > > were not reporting that in the past, it took someone else randomly > > deciding to run LTP to report it to me... > > > > Why did an improvement in results not show up? > > Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? 
> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > (commit 49eea433b326 in mainline) so this "fix" is changing the > timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > ARM64. Needs review though :\
RE: [PATCH 4.4 00/92] 4.4.133-stable review
Hello Rafael, The tests fcntl35 and fcntl35_64 should have gone from FAIL to PASS. https://www.spinics.net/lists/stable/msg239475.html Looking at https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a3d7cdea9/testrun/228569/suite/ltp-syscalls-tests/tests/ I see that these two tests (and other important tests as well) are being SKIPPED. By the way, I see that select04 FAILS in your case. But in my setup, select04 was working fine (x86_64) in 4.4.132. I will confirm that it still works in 4.4.133. Thanks, Daniel Sangorrin > -Original Message- > From: stable-ow...@vger.kernel.org [mailto:stable-ow...@vger.kernel.org] On > Behalf Of Rafael Tinoco > Sent: Friday, May 25, 2018 5:32 AM > To: Greg Kroah-Hartman > Cc: linux-kernel@vger.kernel.org; sh...@kernel.org; patc...@kernelci.org; > lkft-tri...@lists.linaro.org; ben.hutchi...@codethink.co.uk; > sta...@vger.kernel.org; a...@linux-foundation.org; > torva...@linux-foundation.org; li...@roeck-us.net > Subject: Re: [PATCH 4.4 00/92] 4.4.133-stable review > > > > kernel: 4.4.133-rc1 > > > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > > git branch: linux-4.4.y > > > git commit: 915a3d7cdea9daa9e9fb6b855f10c1312e6910c4 > > > git describe: v4.4.132-93-g915a3d7cdea9 > > > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.132-93-g915a > 3d7cdea9 > > > > > > > > > No regressions (compared to build v4.4.132-71-g180635995c36) > > > > It should have gotten better, as there was a fix in here for at least 2 > > LTP tests that we previously were not passing. I don't know why you all > > were not reporting that in the past, it took someone else randomly > > deciding to run LTP to report it to me... > > > > Why did an improvement in results not show up? > > Are you referring to the CLOCK_MONOTONIC_RAW fix for the arm64 vDSO ? 
> I think that CLOCK_MONOTONIC_RAW in VDSO wasn't backported to 4.4.y > (commit 49eea433b326 in mainline) so this "fix" is changing the > timekeeping sauce (that would fix MONOTONIC RAW) but not for 4.4.y in > ARM64. Needs review though :\
RE: [kernel-hardening] [RFC PATCH 1/1] seccomp: provide information about the previous syscall
Hi, Jann, Andy, Alexei, Kees and Paul: thanks a lot for your comments on my RFC!!. There were a few important points that I didn't mention but are critical to understand what I was trying to do. The focus of the patch was on protecting "real-time embedded IoT devices" such as a PLC (programmable logic controller) inside a factory assembly line . They have a few important properties that I took into consideration: - They often rely on firewall technology, and are not updated for many years (~20 years). For that reason, I think that a white-list approach (define the correct behaviour) seems suitable. Note also that the typical problem of white list approaches, false-positives, is unlikely to occur because they are very deterministic systems. - No asynchronous signal handlers: real-time applications need deterministic response times. For that reason, signals are handled synchronously typically by using 'sigtimedwait' on a separate thread. - Initialization vs cycle: real-time applications usually have an initialization phase where memory and stack are locked into RAM and threads are created. After the initialization phase, threads typically loop through periodic cycles and perform their tasks. The important point here is that once the initialization is done we can ban any further calls to 'clone', 'execve', 'mprotect' and the like. This can be done already by installing an extra filter. For the cyclic phase, my patch would allow enforcing the order of the system calls inside the cycles. (e.g.: read sensor, send a message, and write to an actuator). Despite the fact that the attacker cannot call 'clone' anymore, he could try to alter the control of an external actuator (e.g. a motor) by using the 'ioctl' system call for example. - Mimicry: as I mentioned in the cover letter (and Jann showed with his ROP attack) if the attacker is able to emulate the system call's order (plus its arguments and the address from which the call was made) this patch can be bypassed. 
However, note that this is not easy for several reasons: + the attacker may need a long stack to mimic all the system calls and their arguments. + the stealthy attacker must make sure the real-time application does not crash, miss any of its deadlines or cause deadline misses in other apps [Note] Real-time application binaries are usually closed source so this might require quite a bit of effort. + randomized system calls: applications could randomly activate dummy system calls each time they are instantiated (and adjust their BPF filter, which should later be zeroed). In this case, the attacker (or virus) would need to figure out which dummy system calls have to be mimicked and prepare a stack accordingly. This seems challenging. [Note] under a brute force attack, the application may just raise an alarm, activate a redundant node (not connected to the network) and commit digital suicide :). About the ABI, by all means I don't want to break it. If putting the field at the end does not break it, as Alexei mentioned, I can change it. Also I would be glad to review the SECCOMP_FILTER_FLAG_TSYNC flag mentioned by Jann in case there is any interest. However, I'll understand the NACK if you think that the maintenance is not worth it as Andy mentioned; that it can be bypassed under certain conditions; or the fact that it focuses on a particular type of systems. I will keep reading the messages in the kernel-hardening list and see if I find another topic to contribute :). Thanks a lot for your consideration and comments, Daniel
RE: [kernel-hardening] [RFC PATCH 1/1] seccomp: provide information about the previous syscall
Hi, Jann, Andy, Alexei, Kees and Paul: thanks a lot for your comments on my RFC!!. There were a few important points that I didn't mention but are critical to understand what I was trying to do. The focus of the patch was on protecting "real-time embedded IoT devices" such as a PLC (programmable logic controller) inside a factory assembly line . They have a few important properties that I took into consideration: - They often rely on firewall technology, and are not updated for many years (~20 years). For that reason, I think that a white-list approach (define the correct behaviour) seems suitable. Note also that the typical problem of white list approaches, false-positives, is unlikely to occur because they are very deterministic systems. - No asynchronous signal handlers: real-time applications need deterministic response times. For that reason, signals are handled synchronously typically by using 'sigtimedwait' on a separate thread. - Initialization vs cycle: real-time applications usually have an initialization phase where memory and stack are locked into RAM and threads are created. After the initialization phase, threads typically loop through periodic cycles and perform their tasks. The important point here is that once the initialization is done we can ban any further calls to 'clone', 'execve', 'mprotect' and the like. This can be done already by installing an extra filter. For the cyclic phase, my patch would allow enforcing the order of the system calls inside the cycles. (e.g.: read sensor, send a message, and write to an actuator). Despite the fact that the attacker cannot call 'clone' anymore, he could try to alter the control of an external actuator (e.g. a motor) by using the 'ioctl' system call for example. - Mimicry: as I mentioned in the cover letter (and Jann showed with his ROP attack) if the attacker is able to emulate the system call's order (plus its arguments and the address from which the call was made) this patch can be bypassed. 
However, note that this is not easy for several reasons: + the attacker may need a long stack to mimic all the system calls and their arguments. + the stealthy attacker must make sure the real-time application does not crash, miss any of its deadlines or cause deadline misses in other apps [Note] Real-time application binaries are usually closed source so this might require quite a bit of effort. + randomized system calls: applications could randomly activate dummy system calls each time they are instantiated (and adjust their BPF filter, which should later be zeroed). In this case, the attacker (or virus) would need to figure out which dummy system calls have to be mimicked and prepare a stack accordingly. This seems challenging. [Note] under a brute force attack, the application may just raise an alarm, activate a redundant node (not connected to the network) and commit digital suicide :). About the ABI, by all means I don't want to break it. If putting the field at the end does not break it, as Alexei mentioned, I can change it. Also I would be glad to review the SECCOMP_FILTER_FLAG_TSYNC flag mentioned by Jann in case there is any interest. However, I'll understand the NACK if you think that the maintenance is not worth it as Andy mentioned; that it can be bypassed under certain conditions; or the fact that it focuses on a particular type of systems. I will keep reading the messages in the kernel-hardening list and see if I find another topic to contribute :). Thanks a lot for your consideration and comments, Daniel
[RFC PATCH 0/1] Adding previous syscall context to seccomp
Hi, During my presentation last year at Linuxcon Japan [1], I released a proof-of-concept patch [2] for the seccomp subsystem. The main purpose of that patch was to let applications restrict the order in which their system calls are requested. In more technical terms, a host-based anomaly intrusion detection system (HIDS) that uses call sequence monitoring for detecting unusual patterns. For example, to detect when the execution flow unexpectedly diverts towards the 'mprotect' syscall, perhaps after a stack overflow. The main target for the patch was embedded real-time systems where applications have a high degree of determinism. For that reason, my original proof-of-concept patch was using bitmaps, which allow for a constant O(1) overhead (correct me if I'm wrong but I think the current seccomp-filter implementation introduces an O(n) overhead proportional to the number of system calls that one wants to allow or prohibit). However, I realized that it would be too hard to merge with the current code. I have adapted my original patch which now allows BPF filters to retrieve information regarding the previous system call requested by the application. The patch can be tested on linux-master as follows (tested on Debian Jessie x86_64): $ sudo vi /usr/include/linux/seccomp.h ... struct seccomp_data { int nr; int prev_nr; <-- add this entry ... $ cd samples/seccomp/ $ make bpf-prev $ ./bpf-prev parent msgsnd: hello parent msgrcv after prctl: hello (128 bytes) parent msgsnd: world parent msgrcv after msgsnd: world (128 bytes) parent msgsnd: this is mars child msgrcv after clone: this is mars (128 bytes) parent: child 11409 exited with status 0 Should fail: Bad system call For simplicity, at the moment the patch only records the last requested system call. Despite being vulnerable to specially- crafted mimicry attacks, I think it can deter common attacks specially during the "initial phase" of the attack (e.g.: a return-oriented jump). 
It could also be extended with longer call sequences (NGRAMs), call stack and call site information, or scratch memory for restricting a system call to the application's initalization for example. However, I'm not sure if such complexity would be worth. I would like to know at this early stage if any of you is interested in this type of approach and what you think about it. Thanks, Daniel [1] Kernel security hacking for the Internet of Things http://events.linuxfoundation.jp/sites/events/files/slides/linuxcon-2015-daniel-sangorrin-final.pdf [2] https://github.com/sangorrin/linuxcon-japan-2015/tree/master/hids Daniel Sangorrin (1): seccomp: provide information about the previous syscall include/linux/seccomp.h | 2 + include/uapi/linux/seccomp.h | 2 + kernel/seccomp.c | 10 +++ samples/seccomp/.gitignore | 1 + samples/seccomp/Makefile | 9 ++- samples/seccomp/bpf-prev.c | 160 +++ 6 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 samples/seccomp/bpf-prev.c -- 2.1.4
[RFC PATCH 1/1] seccomp: provide information about the previous syscall
This patch allows applications to restrict the order in which their system calls may be requested.
@@ -46,6 +47,7 @@ */ struct seccomp_data { int nr; + int prev_nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d..98b2c9d3 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,6 +190,8 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) sd = _local; } + sd->prev_nr = current->seccomp.prev_nr; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -200,6 +202,9 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } + + current->seccomp.prev_nr = sd->nr; + return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -443,6 +448,11 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Initialize the prev_nr field only once */ + if (current->seccomp.filter == NULL) + current->seccomp.prev_nr = + syscall_get_nr(current, task_pt_regs(current)); + /* * If there is an existing filter, make it the prev and don't drop its * task reference. diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index 78fb781..11dda7a 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,3 +1,4 @@ bpf-direct bpf-fancy dropper +bpf-prev diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 1b4e4b8..b50821c 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,7 +1,7 @@ # kbuild trick to avoid linker error. Can be omitted if a module is built. 
obj- := dummy.o -hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct +hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct bpf-prev HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include @@ -17,6 +17,11 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o +HOSTCFLAGS_bpf-prev.o += -I$(objtree)/usr/include +HOSTCFLAGS_bpf-prev.o += -idirafter $(objtree)/include +bpf-prev-objs := bpf-prev.o + + # Try to match the kernel target. ifndef CROSS_COMPILE ifndef CONFIG_64BIT @@ -29,10 +34,12 @@ MFLAG = -m31 endif HOSTCFLAGS_bpf-direct.o += $(MFLAG) +HOSTCFLAGS_bpf-prev.o += $(MFLAG) HOSTCFLAGS_dropper.o += $(MFLAG) HOSTCFLAGS_bpf-helper.o += $(MFLAG) HOSTCFLAGS_bpf-fancy.o += $(MFLAG) HOSTLOADLIBES_bpf-direct += $(MFLAG) +HOSTLOADLIBES_bpf-prev += $(MFLAG) HOSTLOADLIBES_bpf-fancy += $(MFLAG) HOSTLOADLIBES_dropper += $(MFLAG) endif diff --git a/samples/seccomp/bpf-prev.c b/samples/seccomp/bpf-prev.c new file mode 100644 index 000..138c584 --- /dev/null +++ b/samples/seccomp/bpf-prev.c @@ -0,0 +1,160 @@ +/* + *
[RFC PATCH 1/1] seccomp: provide information about the previous syscall
This patch allows applications to restrict the order in which their system calls may be requested.
@@ -46,6 +47,7 @@ */ struct seccomp_data { int nr; + int prev_nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d..98b2c9d3 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,6 +190,8 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) sd = _local; } + sd->prev_nr = current->seccomp.prev_nr; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -200,6 +202,9 @@ static u32 seccomp_run_filters(struct seccomp_data *sd) if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } + + current->seccomp.prev_nr = sd->nr; + return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -443,6 +448,11 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Initialize the prev_nr field only once */ + if (current->seccomp.filter == NULL) + current->seccomp.prev_nr = + syscall_get_nr(current, task_pt_regs(current)); + /* * If there is an existing filter, make it the prev and don't drop its * task reference. diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index 78fb781..11dda7a 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,3 +1,4 @@ bpf-direct bpf-fancy dropper +bpf-prev diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 1b4e4b8..b50821c 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,7 +1,7 @@ # kbuild trick to avoid linker error. Can be omitted if a module is built. 
obj- := dummy.o -hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct +hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct bpf-prev HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include @@ -17,6 +17,11 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o +HOSTCFLAGS_bpf-prev.o += -I$(objtree)/usr/include +HOSTCFLAGS_bpf-prev.o += -idirafter $(objtree)/include +bpf-prev-objs := bpf-prev.o + + # Try to match the kernel target. ifndef CROSS_COMPILE ifndef CONFIG_64BIT @@ -29,10 +34,12 @@ MFLAG = -m31 endif HOSTCFLAGS_bpf-direct.o += $(MFLAG) +HOSTCFLAGS_bpf-prev.o += $(MFLAG) HOSTCFLAGS_dropper.o += $(MFLAG) HOSTCFLAGS_bpf-helper.o += $(MFLAG) HOSTCFLAGS_bpf-fancy.o += $(MFLAG) HOSTLOADLIBES_bpf-direct += $(MFLAG) +HOSTLOADLIBES_bpf-prev += $(MFLAG) HOSTLOADLIBES_bpf-fancy += $(MFLAG) HOSTLOADLIBES_dropper += $(MFLAG) endif diff --git a/samples/seccomp/bpf-prev.c b/samples/seccomp/bpf-prev.c new file mode 100644 index 000..138c584 --- /dev/null +++ b/sample
[RFC PATCH 0/1] Adding previous syscall context to seccomp
Hi, During my presentation last year at Linuxcon Japan [1], I released a proof-of-concept patch [2] for the seccomp subsystem. The main purpose of that patch was to let applications restrict the order in which their system calls are requested. In more technical terms, a host-based anomaly intrusion detection system (HIDS) that uses call sequence monitoring for detecting unusual patterns. For example, to detect when the execution flow unexpectedly diverts towards the 'mprotect' syscall, perhaps after a stack overflow. The main target for the patch was embedded real-time systems where applications have a high degree of determinism. For that reason, my original proof-of-concept patch was using bitmaps, which allow for a constant O(1) overhead (correct me if I'm wrong but I think the current seccomp-filter implementation introduces an O(n) overhead proportional to the number of system calls that one wants to allow or prohibit). However, I realized that it would be too hard to merge with the current code. I have adapted my original patch which now allows BPF filters to retrieve information regarding the previous system call requested by the application. The patch can be tested on linux-master as follows (tested on Debian Jessie x86_64): $ sudo vi /usr/include/linux/seccomp.h ... struct seccomp_data { int nr; int prev_nr; <-- add this entry ... $ cd samples/seccomp/ $ make bpf-prev $ ./bpf-prev parent msgsnd: hello parent msgrcv after prctl: hello (128 bytes) parent msgsnd: world parent msgrcv after msgsnd: world (128 bytes) parent msgsnd: this is mars child msgrcv after clone: this is mars (128 bytes) parent: child 11409 exited with status 0 Should fail: Bad system call For simplicity, at the moment the patch only records the last requested system call. Despite being vulnerable to specially- crafted mimicry attacks, I think it can deter common attacks specially during the "initial phase" of the attack (e.g.: a return-oriented jump). 
It could also be extended with longer call sequences (NGRAMs), call stack and call site information, or scratch memory for restricting a system call to the application's initalization for example. However, I'm not sure if such complexity would be worth. I would like to know at this early stage if any of you is interested in this type of approach and what you think about it. Thanks, Daniel [1] Kernel security hacking for the Internet of Things http://events.linuxfoundation.jp/sites/events/files/slides/linuxcon-2015-daniel-sangorrin-final.pdf [2] https://github.com/sangorrin/linuxcon-japan-2015/tree/master/hids Daniel Sangorrin (1): seccomp: provide information about the previous syscall include/linux/seccomp.h | 2 + include/uapi/linux/seccomp.h | 2 + kernel/seccomp.c | 10 +++ samples/seccomp/.gitignore | 1 + samples/seccomp/Makefile | 9 ++- samples/seccomp/bpf-prev.c | 160 +++ 6 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 samples/seccomp/bpf-prev.c -- 2.1.4
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/25 13:51, Viresh Kumar wrote: > On 25 April 2014 06:01, Daniel Sangorrin > wrote: >> >>>> I can't keep it as a separate patch and so would be required to merge >>>> it into my original patch.. >>> >>> And the reason being: "No patch is supposed to break things, otherwise >>> git bisect wouldn't work smoothly".. And so git bisect would complain >>> this issue after my patch and so I have to merge the fixes you gave into >>> the original patch as its not yet merged. >>> -- >>> To unsubscribe from this list: send the line "unsubscribe cgroups" in >>> the body of a message to majord...@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> Ok, no problem then. > > Do you want me to add your Tested-by for my next version ? > > Sure. Tested-by: Daniel Sangorrin -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi --- Documentation/timers/timer_stats.txt | 43 kernel/time/timer_stats.c| 20 + 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..aeb467f 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,24 +45,29 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which -initialized the timer and in parenthesis the callback function which was +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which +initialized the timer and in parentheses the callback function which was executed on expiry. 
Thomas, Ingo diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr->count = 0; + memset(curr->count, 0, sizeof(curr->count)); curr->next = NULL; memcpy(curr->comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(, comm); if (likely(entry)) - entry->count++; + entry->count[raw_smp_processor_id()]++; else atomic_inc(_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0; ktime_t time; - int
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
>> I can't keep it as a separate patch and so would be required to merge >> it into my original patch.. > > And the reason being: "No patch is supposed to break things, otherwise > git bisect wouldn't work smoothly".. And so git bisect would complain > this issue after my patch and so I have to merge the fixes you gave into > the original patch as its not yet merged. > -- > To unsubscribe from this list: send the line "unsubscribe cgroups" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html Ok, no problem then. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] timer_stats: add core information to event counters
On 2014/04/25 2:54, Randy Dunlap wrote: > On 04/24/14 00:47, Daniel Sangorrin wrote: >> Add information specifying the CPU core where timer callbacks (events) >> were executed to the output of /proc/timer_stats. >> >> Signed-off-by: Daniel Sangorrin >> Signed-off-by: Yoshitake Kobayashi >> --- >> Documentation/timers/timer_stats.txt | 41 >> >> kernel/time/timer_stats.c| 20 ++ >> 2 files changed, 34 insertions(+), 27 deletions(-) >> >> diff --git a/Documentation/timers/timer_stats.txt >> b/Documentation/timers/timer_stats.txt >> index 8abd40b..e54077a 100644 >> --- a/Documentation/timers/timer_stats.txt >> +++ b/Documentation/timers/timer_stats.txt >> @@ -21,7 +21,7 @@ Linux system over a sample period: >> - the name of the process which initialized the timer >> - the function where the timer was initialized >> - the callback function which is associated to the timer >> -- the number of events (callbacks) >> +- the number of events (callbacks) executed per core >> >> timer_stats adds an entry to /proc: /proc/timer_stats >> >> @@ -45,23 +45,28 @@ readouts. 
>> >> Sample output of /proc/timer_stats: >> >> -Timerstats sample period: 3.888770 s >> - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) >> - 15, 1 swapper hcd_submit_urb (rh_timer_func) >> - 4, 959 kedacschedule_timeout (process_timeout) >> - 1, 0 swapper page_writeback_init (wb_timer_fn) >> - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) >> - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) >> - 3, 3100 bash schedule_timeout (process_timeout) >> - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) >> - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) >> - 1, 1 swapper neigh_table_init_no_netlink >> (neigh_periodic_timer) >> - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) >> - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) >> -90 total events, 30.0 events/sec >> - >> -The first column is the number of events, the second column the pid, the >> third >> -column is the name of the process. The forth column shows the function which >> +Timer Stats Version: v0.3 >> +Sample period: 4.365 s >> +Collection: inactive >> + 0, 4366, 0 swapper/1tick_setup_sched_timer >> (tick_sched_timer) >> + 4366, 0, 1 swapper/0tick_setup_sched_timer >> (tick_sched_timer) >> + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) >> + 0, 4, 515 apache2 schedule_hrtimeout_range_clock >> (hrtimer_wakeup) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> +16, 0, 0 swapper/0usb_hcd_poll_rh_status >> (rh_timer_func) >> + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) >> + 3, 4, 0 swapper/0clocksource_watchdog >> (clocksource_watchdog) >> +0D, 3D,13 kworker/1:0 queue_delayed_work_on >> (delayed_work_timer_fn) >> +3D, 0D,22 kworker/0:1 queue_delayed_work_on >> (delayed_work_timer_fn) >> + 1, 0,22 kworker/0:1 e1000_watchdog_task 
(e1000_watchdog) >> +8842 total events, 2025.658 events/sec >> + > > Is the number of events in decimal or hex? If in decimal, what do > 0D and 3D mean? It is in decimal notation. The 'D' means 'deferrable timer' and it is already described in timer_stats.txt. >> +The first comma-separated columns represent the number of events (one >> +column per present core), the next column the pid, and the next one is the >> +name of the process. The last column shows the function which >> initialized the timer and in parenthesis the callback function which was > > parentheses Well spotted. I'll fix it and send it again. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 17:41, Viresh Kumar wrote: > On 24 April 2014 14:01, Daniel Sangorrin > wrote: >> In kernel/cpuset.c:quiesce_cpuset() you are using the function >> 'smp_call_function_any' which asks CPU cores in 'cpumask' to >> execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. >> >> In the case above, 'cpumask' corresponds to core 0. Since I'm forcing >> the call to be executed from core 1 (by using taskset), >> an inter-processor interrupt is sent to core 0 for those functions >> to be executed. > > Ahh, I understood that now :) .. So we are setting cpuset.quiesce from CPU1 > which will do a IPI to get migrate_timers called on CPU0.. I was setting > quiesce > from CPU0 only in my tests :) > > But how does this work fine on x86 then? There we should have exactly same > problem, isn't it? Yeah, I'm not sure why it is working on 3.15 x86_64 but not in 3.10 x86_64. Perhaps it's related to this patch: https://lkml.org/lkml/2013/9/19/349 >> Ok, thank you! I see that you have already fixed the problem. I tested >> your tree on ARM and now it seems to work correctly. > > Yeah, I just pushed your changes as well at the time I wrote last mail :) Oh, I see! Why didn't you just apply the patch on top of your tree so that the information included in the git commit (e.g: my name and mail) remains? This part: cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). 
Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 16:43, Viresh Kumar wrote: > On 24 April 2014 12:55, Daniel Sangorrin > wrote: >> I tried your set of patches for isolating particular CPU cores from unpinned >> timers. On x86_64 they were working fine, however I found out that on ARM >> they would fail under the following test: > > I am happy that these drew attention from somebody Atleast :) Thanks to you for your hard work. >> # mount -t cpuset none /cpuset >> # cd /cpuset >> # mkdir rt >> # cd rt >> # echo 1 > cpus >> # echo 1 > cpu_exclusive >> # cd >> # taskset 0x2 ./setquiesce.sh <--- contains "echo 1 > /cpuset/rt/quiesce" >> [ 75.622375] [ cut here ] >> [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 >> __migrate_hrtimers+0x17c/0x1bc() >> [ 75.636840] DEBUG_LOCKS_WARN_ON(current->hardirq_context) >> [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted >> 3.14.0-rc1-37710-g23c8f02 #1 >> [ 75.649627] [] (unwind_backtrace) from [] >> (show_stack+0x10/0x14) >> [ 75.649627] [] (show_stack) from [] >> (dump_stack+0x78/0x94) >> [ 75.662689] [] (dump_stack) from [] >> (warn_slowpath_common+0x60/0x84) >> [ 75.670410] [] (warn_slowpath_common) from [] >> (warn_slowpath_fmt+0x30/0x40) >> [ 75.677673] [] (warn_slowpath_fmt) from [] >> (__migrate_hrtimers+0x17c/0x1bc) >> [ 75.677673] [] (__migrate_hrtimers) from [] >> (generic_smp_call_function_single_interrupt+0x8c/0x104) >> [ 75.699645] [] (generic_smp_call_function_single_interrupt) >> from [] (handle_IPI+0xa4/0x16c) >> [ 75.706970] [] (handle_IPI) from [] >> (gic_handle_irq+0x54/0x5c) >> [ 75.715087] [] (gic_handle_irq) from [] >> (__irq_svc+0x44/0x5c) >> [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) > > I couldn't understand why we went via a interrupt here ? Probably CPU1 > was idle and was woken up with a IPI and then this happened. But in > that case too, > shouldn't the script run from process context instead ? 
In kernel/cpuset.c:quiesce_cpuset() you are using the function 'smp_call_function_any' which asks CPU cores in 'cpumask' to execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. In the case above, 'cpumask' corresponds to core 0. Since I'm forcing the call to be executed from core 1 (by using taskset), an inter-processor interrupt is sent to core 0 for those functions to be executed. >> I also backported your patches to Linux 3.10.y and found the same problem >> both in ARM and x86_64. > > There are very few changes in between 3.10 and latest for timers/hrtimers > and so things are expected to be the same. > >> However, I think I figured out the reason for those >> errors. Please, could you check the patch below (it applies on the top of >> your tree, branch isolate-cpusets) and let me know what you think? > > Okay, just to let you know, I have also found some issues and they are > now pushed in my tree.. Also it is rebased over 3.15-rc2 now. Ok, thank you! I see that you have already fixed the problem. I tested your tree on ARM and now it seems to work correctly. > >> -PATCH STARTS HERE- >> cpuset: quiesce: change irq disable/enable by irq save/restore >> >> The function __migrate_timers can be called under interrupt context >> or thread context depending on the core where the system call was >> executed. In case it executes under interrupt context, it > > How exactly? See my reply above. >> seems a bad idea to leave interrupts enabled after migrating the >> timers. In fact, this caused kernel errors on the ARM architecture and >> on the x86_64 architecture with the 3.10 kernel (backported version >> of the cpuset-quiesce patch). > > I can't keep it as a separate patch and so would be required to merge > it into my original patch.. 
> > Thanks for your inputs :) > -- > To unsubscribe from this list: send the line "unsubscribe cgroups" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timer_stats: add core information to event counters
Hi, This patch adds information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. I tested the patch on a dual-core x86_64 and another dual-core ARM board. This patch may be useful for debugging timer execution in multi-core environments. Daniel Sangorrin (1): timer_stats: add core information to event counters Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) -- 1.8.1.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi --- Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..e54077a 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,23 +45,28 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which initialized the timer and in parenthesis the callback function which was executed on expiry. 
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr->count = 0; + memset(curr->count, 0, sizeof(curr->count)); curr->next = NULL; memcpy(curr->comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(, comm); if (likely(entry)) - entry->count++; + entry->count[raw_smp_processor_id()]++; else atomic_inc(_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0; ktime_t time; - int i; + int i, cpu; mutex_lock(_mutex); /* @@ -307,19 +307,21 @@ static int tstats
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
Dear Viresh Kumar, I tried your set of patches for isolating particular CPU cores from unpinned timers. On x86_64 they were working fine, however I found out that on ARM they would fail under the following test: # mount -t cpuset none /cpuset # cd /cpuset # mkdir rt # cd rt # echo 1 > cpus # echo 1 > cpu_exclusive # cd # taskset 0x2 ./setquiesce.sh <--- contains "echo 1 > /cpuset/rt/quiesce" [ 75.622375] [ cut here ] [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 __migrate_hrtimers+0x17c/0x1bc() [ 75.636840] DEBUG_LOCKS_WARN_ON(current->hardirq_context) [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-37710-g23c8f02 #1 [ 75.649627] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 75.649627] [] (show_stack) from [] (dump_stack+0x78/0x94) [ 75.662689] [] (dump_stack) from [] (warn_slowpath_common+0x60/0x84) [ 75.670410] [] (warn_slowpath_common) from [] (warn_slowpath_fmt+0x30/0x40) [ 75.677673] [] (warn_slowpath_fmt) from [] (__migrate_hrtimers+0x17c/0x1bc) [ 75.677673] [] (__migrate_hrtimers) from [] (generic_smp_call_function_single_interrupt+0x8c/0x104) [ 75.699645] [] (generic_smp_call_function_single_interrupt) from [] (handle_IPI+0xa4/0x16c) [ 75.706970] [] (handle_IPI) from [] (gic_handle_irq+0x54/0x5c) [ 75.715087] [] (gic_handle_irq) from [] (__irq_svc+0x44/0x5c) [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) I also backported your patches to Linux 3.10.y and found the same problem both in ARM and x86_64. However, I think I figured out the reason for those errors. Please, could you check the patch below (it applies on the top of your tree, branch isolate-cpusets) and let me know what you think? Thanks, Daniel -PATCH STARTS HERE- cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. 
In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). Signed-off-by: Daniel Sangorrin Signed-off-by: Yoshitake Kobayashi --- kernel/hrtimer.c | 5 +++-- kernel/timer.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8cd1db..abb1707 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1701,8 +1701,9 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) struct hrtimer_clock_base *clock_base; unsigned int active_bases; int i; + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); old_base = _cpu(hrtimer_bases, scpu); new_base = &__get_cpu_var(hrtimer_bases); /* @@ -1722,7 +1723,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ diff --git a/kernel/timer.c b/kernel/timer.c index 4676a07..2715b7d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1644,6 +1644,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) struct tvec_base *old_base; struct tvec_base *new_base; int i; + unsigned long flags; old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); @@ -1651,7 +1652,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(_base->lock); + spin_lock_irqsave(_base->lock, flags); spin_lock_nested(_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1671,7 +1672,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) } spin_unlock(_base->lock); - spin_unlock_irq(_base->lock); + spin_unlock_irqrestore(_base->lock, flags); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ -- 1.8.1.1 -PATCH ENDS HERE- On 2014/03/20 22:48, Viresh Kumar wrote: > For networking platforms we need to provide one isolated CPU per user space > data > plane thread. These CPUs should not be interrupted by kernel at all unless > userspace has requested for some syscalls. And so must live in isolated mode. > Currently, there are background kernel activities that are running on almost > every CPU, like: timers/hrtimers/watchdogs/etc, and these a
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
Dear Viresh Kumar, I tried your set of patches for isolating particular CPU cores from unpinned timers. On x86_64 they were working fine, however I found out that on ARM they would fail under the following test: # mount -t cpuset none /cpuset # cd /cpuset # mkdir rt # cd rt # echo 1 cpus # echo 1 cpu_exclusive # cd # taskset 0x2 ./setquiesce.sh --- contains echo 1 /cpuset/rt/quiesce [ 75.622375] [ cut here ] [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 __migrate_hrtimers+0x17c/0x1bc() [ 75.636840] DEBUG_LOCKS_WARN_ON(current-hardirq_context) [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-37710-g23c8f02 #1 [ 75.649627] [c0014d18] (unwind_backtrace) from [c00119e8] (show_stack+0x10/0x14) [ 75.649627] [c00119e8] (show_stack) from [c065b61c] (dump_stack+0x78/0x94) [ 75.662689] [c065b61c] (dump_stack) from [c003e9a4] (warn_slowpath_common+0x60/0x84) [ 75.670410] [c003e9a4] (warn_slowpath_common) from [c003ea24] (warn_slowpath_fmt+0x30/0x40) [ 75.677673] [c003ea24] (warn_slowpath_fmt) from [c005d7b0] (__migrate_hrtimers+0x17c/0x1bc) [ 75.677673] [c005d7b0] (__migrate_hrtimers) from [c009e004] (generic_smp_call_function_single_interrupt+0x8c/0x104) [ 75.699645] [c009e004] (generic_smp_call_function_single_interrupt) from [c00134d0] (handle_IPI+0xa4/0x16c) [ 75.706970] [c00134d0] (handle_IPI) from [c0008614] (gic_handle_irq+0x54/0x5c) [ 75.715087] [c0008614] (gic_handle_irq) from [c0012624] (__irq_svc+0x44/0x5c) [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) I also backported your patches to Linux 3.10.y and found the same problem both in ARM and x86_64. However, I think I figured out the reason for those errors. Please, could you check the patch below (it applies on the top of your tree, branch isolate-cpusets) and let me know what you think? 
Thanks, Daniel -PATCH STARTS HERE- cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- kernel/hrtimer.c | 5 +++-- kernel/timer.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8cd1db..abb1707 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1701,8 +1701,9 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) struct hrtimer_clock_base *clock_base; unsigned int active_bases; int i; + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); old_base = per_cpu(hrtimer_bases, scpu); new_base = __get_cpu_var(hrtimer_bases); /* @@ -1722,7 +1723,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ diff --git a/kernel/timer.c b/kernel/timer.c index 4676a07..2715b7d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1644,6 +1644,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) struct tvec_base *old_base; struct tvec_base *new_base; int i; + unsigned long flags; old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); @@ -1651,7 +1652,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(new_base-lock); + spin_lock_irqsave(new_base-lock, flags); spin_lock_nested(old_base-lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base-running_timer); @@ -1671,7 +1672,7 @@ static void __migrate_timers(int cpu, bool remove_pinned) } spin_unlock(old_base-lock); - spin_unlock_irq(new_base-lock); + spin_unlock_irqrestore(new_base-lock, flags); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ -- 1.8.1.1 -PATCH ENDS HERE- On 2014/03/20 22:48, Viresh Kumar wrote: For networking platforms we need to provide one isolated CPU per user space data plane thread. These CPUs should not be interrupted by kernel at all unless userspace has requested for some syscalls. And so must live in isolated mode
[PATCH] timer_stats: add core information to event counters
Hi, This patch adds information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. I tested the patch on a dual-core x86_64 and another dual-core ARM board. This patch may be useful for debugging timer execution in multi-core environments. Daniel Sangorrin (1): timer_stats: add core information to event counters Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) -- 1.8.1.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..e54077a 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,23 +45,28 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which initialized the timer and in parenthesis the callback function which was executed on expiry. 
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr-count = 0; + memset(curr-count, 0, sizeof(curr-count)); curr-next = NULL; memcpy(curr-comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(input, comm); if (likely(entry)) - entry-count++; + entry-count[raw_smp_processor_id()]++; else atomic_inc(overflow_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0; ktime_t time; - int i; + int i, cpu; mutex_lock(show_mutex
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 16:43, Viresh Kumar wrote: On 24 April 2014 12:55, Daniel Sangorrin daniel.sangor...@toshiba.co.jp wrote: I tried your set of patches for isolating particular CPU cores from unpinned timers. On x86_64 they were working fine, however I found out that on ARM they would fail under the following test: I am happy that these drew attention from somebody Atleast :) Thanks to you for your hard work. # mount -t cpuset none /cpuset # cd /cpuset # mkdir rt # cd rt # echo 1 cpus # echo 1 cpu_exclusive # cd # taskset 0x2 ./setquiesce.sh --- contains echo 1 /cpuset/rt/quiesce [ 75.622375] [ cut here ] [ 75.627258] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2595 __migrate_hrtimers+0x17c/0x1bc() [ 75.636840] DEBUG_LOCKS_WARN_ON(current-hardirq_context) [ 75.636840] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-37710-g23c8f02 #1 [ 75.649627] [c0014d18] (unwind_backtrace) from [c00119e8] (show_stack+0x10/0x14) [ 75.649627] [c00119e8] (show_stack) from [c065b61c] (dump_stack+0x78/0x94) [ 75.662689] [c065b61c] (dump_stack) from [c003e9a4] (warn_slowpath_common+0x60/0x84) [ 75.670410] [c003e9a4] (warn_slowpath_common) from [c003ea24] (warn_slowpath_fmt+0x30/0x40) [ 75.677673] [c003ea24] (warn_slowpath_fmt) from [c005d7b0] (__migrate_hrtimers+0x17c/0x1bc) [ 75.677673] [c005d7b0] (__migrate_hrtimers) from [c009e004] (generic_smp_call_function_single_interrupt+0x8c/0x104) [ 75.699645] [c009e004] (generic_smp_call_function_single_interrupt) from [c00134d0] (handle_IPI+0xa4/0x16c) [ 75.706970] [c00134d0] (handle_IPI) from [c0008614] (gic_handle_irq+0x54/0x5c) [ 75.715087] [c0008614] (gic_handle_irq) from [c0012624] (__irq_svc+0x44/0x5c) [ 75.725311] Exception stack(0xc08a3f58 to 0xc08a3fa0) I couldn't understand why we went via a interrupt here ? Probably CPU1 was idle and was woken up with a IPI and then this happened. But in that case too, shouldn't the script run from process context instead ? 
In kernel/cpuset.c:quiesce_cpuset() you are using the function 'smp_call_function_any' which asks CPU cores in 'cpumask' to execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. In the case above, 'cpumask' corresponds to core 0. Since I'm forcing the call to be executed from core 1 (by using taskset), an inter-processor interrupt is sent to core 0 for those functions to be executed. I also backported your patches to Linux 3.10.y and found the same problem both in ARM and x86_64. There are very few changes in between 3.10 and latest for timers/hrtimers and so things are expected to be the same. However, I think I figured out the reason for those errors. Please, could you check the patch below (it applies on the top of your tree, branch isolate-cpusets) and let me know what you think? Okay, just to let you know, I have also found some issues and they are now pushed in my tree.. Also it is rebased over 3.15-rc2 now. Ok, thank you! I see that you have already fixed the problem. I tested your tree on ARM and now it seems to work correctly. -PATCH STARTS HERE- cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it How exactly? See my reply above. seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). I can't keep it as a separate patch and so would be required to merge it into my original patch.. 
Thanks for your inputs :) -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/24 17:41, Viresh Kumar wrote: On 24 April 2014 14:01, Daniel Sangorrin daniel.sangor...@toshiba.co.jp wrote: In kernel/cpuset.c:quiesce_cpuset() you are using the function 'smp_call_function_any' which asks CPU cores in 'cpumask' to execute the functions 'hrtimer_quiesce_cpu' and 'timer_quiesce_cpu'. In the case above, 'cpumask' corresponds to core 0. Since I'm forcing the call to be executed from core 1 (by using taskset), an inter-processor interrupt is sent to core 0 for those functions to be executed. Ahh, I understood that now :) .. So we are setting cpuset.quiesce from CPU1 which will do a IPI to get migrate_timers called on CPU0.. I was setting quiesce from CPU0 only in my tests :) But how does this work fine on x86 then? There we should have exactly same problem, isn't it? Yeah, I'm not sure why it is working on 3.15 x86_64 but not in 3.10 x86_64. Perhaps it's related to this patch: https://lkml.org/lkml/2013/9/19/349 Ok, thank you! I see that you have already fixed the problem. I tested your tree on ARM and now it seems to work correctly. Yeah, I just pushed your changes as well at the time I wrote last mail :) Oh, I see! Why didn't you just apply the patch on top of your tree so that the information included in the git commit (e.g: my name and mail) remains? This part: cpuset: quiesce: change irq disable/enable by irq save/restore The function __migrate_timers can be called under interrupt context or thread context depending on the core where the system call was executed. In case it executes under interrupt context, it seems a bad idea to leave interrupts enabled after migrating the timers. In fact, this caused kernel errors on the ARM architecture and on the x86_64 architecture with the 3.10 kernel (backported version of the cpuset-quiesce patch). 
Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] timer_stats: add core information to event counters
On 2014/04/25 2:54, Randy Dunlap wrote: On 04/24/14 00:47, Daniel Sangorrin wrote: Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- Documentation/timers/timer_stats.txt | 41 kernel/time/timer_stats.c| 20 ++ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..e54077a 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,23 +45,28 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + Is the number of events in decimal or hex? If in decimal, what do 0D and 3D mean? It is in decimal notation. The 'D' means 'deferrable timer' and it is already described in timer_stats.txt. +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which initialized the timer and in parenthesis the callback function which was parentheses Well spotted. I'll fix it and send it again. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
I can't keep it as a separate patch and so would be required to merge it into my original patch.. And the reason being: No patch is supposed to break things, otherwise git bisect wouldn't work smoothly.. And so git bisect would complain this issue after my patch and so I have to merge the fixes you gave into the original patch as its not yet merged. -- To unsubscribe from this list: send the line unsubscribe cgroups in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Ok, no problem then. Thanks, Daniel -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] timer_stats: add core information to event counters
Add information specifying the CPU core where timer callbacks (events) were executed to the output of /proc/timer_stats. Signed-off-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp Signed-off-by: Yoshitake Kobayashi yoshitake.kobaya...@toshiba.co.jp --- Documentation/timers/timer_stats.txt | 43 kernel/time/timer_stats.c| 20 + 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 8abd40b..aeb467f 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -21,7 +21,7 @@ Linux system over a sample period: - the name of the process which initialized the timer - the function where the timer was initialized - the callback function which is associated to the timer -- the number of events (callbacks) +- the number of events (callbacks) executed per core timer_stats adds an entry to /proc: /proc/timer_stats @@ -45,24 +45,29 @@ readouts. Sample output of /proc/timer_stats: -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedacschedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1,23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. 
The forth column shows the function which -initialized the timer and in parenthesis the callback function which was +Timer Stats Version: v0.3 +Sample period: 4.365 s +Collection: inactive + 0, 4366, 0 swapper/1tick_setup_sched_timer (tick_sched_timer) + 4366, 0, 1 swapper/0tick_setup_sched_timer (tick_sched_timer) + 0, 5, 1132 sshd sk_reset_timer (tcp_write_timer) + 0, 4, 515 apache2 schedule_hrtimeout_range_clock (hrtimer_wakeup) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) +16, 0, 0 swapper/0usb_hcd_poll_rh_status (rh_timer_func) + 0, 7, 0 swapper/1sk_reset_timer (tcp_delack_timer) + 3, 4, 0 swapper/0clocksource_watchdog (clocksource_watchdog) +0D, 3D,13 kworker/1:0 queue_delayed_work_on (delayed_work_timer_fn) +3D, 0D,22 kworker/0:1 queue_delayed_work_on (delayed_work_timer_fn) + 1, 0,22 kworker/0:1 e1000_watchdog_task (e1000_watchdog) +8842 total events, 2025.658 events/sec + +The first comma-separated columns represent the number of events (one +column per present core), the next column the pid, and the next one is the +name of the process. The last column shows the function which +initialized the timer and in parentheses the callback function which was executed on expiry. 
Thomas, Ingo diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f2..53d1544 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -67,7 +67,7 @@ struct entry { /* * Number of timeout events: */ - unsigned long count; + unsigned long count[NR_CPUS]; unsigned inttimer_flag; /* @@ -203,7 +203,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) curr = alloc_entry(); if (curr) { *curr = *entry; - curr-count = 0; + memset(curr-count, 0, sizeof(curr-count)); curr-next = NULL; memcpy(curr-comm, comm, TASK_COMM_LEN); @@ -259,7 +259,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, entry = tstat_lookup(input, comm); if (likely(entry)) - entry-count++; + entry-count[raw_smp_processor_id()]++; else atomic_inc(overflow_count); @@ -284,7 +284,7 @@ static int tstats_show(struct seq_file *m, void *v) unsigned long ms; long events = 0
Re: [RFC 0/4] Migrate timers away from cpuset on setting cpuset.quiesce
On 2014/04/25 13:51, Viresh Kumar wrote: On 25 April 2014 06:01, Daniel Sangorrin daniel.sangor...@toshiba.co.jp wrote: I can't keep it as a separate patch and so would be required to merge it into my original patch.. And the reason being: No patch is supposed to break things, otherwise git bisect wouldn't work smoothly.. And so git bisect would complain this issue after my patch and so I have to merge the fixes you gave into the original patch as its not yet merged. -- To unsubscribe from this list: send the line unsubscribe cgroups in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Ok, no problem then. Do you want me to add your Tested-by for my next version ? Sure. Tested-by: Daniel Sangorrin daniel.sangor...@toshiba.co.jp -- Toshiba Corporate Software Engineering Center Daniel SANGORRIN E-mail: daniel.sangor...@toshiba.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/