date:20140314

[PATCH_v4 0/2] arm64: Add seccomp support

2014-03-14 Thread AKASHI Takahiro

(Please apply this patch after my ftrace patch and audit patch in order
to avoid conflicts in arm64/Kconfig.)

This patch enables secure computing (system call filtering) on arm64.
System calls can be allowed or denied by loaded bpf-style rules.
The architecture-specific part runs secure_computing() on syscall entry
and checks the result. See [2/2].

Prerequisites are:
 * "arm64: make a single hook to syscall_trace() for all syscall features" patch
 * "arm64: split syscall_trace() into separate functions for enter/exit" patch
 * "arm64: Add audit support" patch
 * "arm64: is_compat_task is defined both in asm/compat.h and 
linux/compat.h" patch

This code is tested on ARMv8 fast model using libseccomp v2.1.1 with
modifications for arm64 and verified by its "live" tests, 20, 21 and 24.

Changes v3 -> v4:
* removed the following patch and moved it to "arm64: prerequisites for
  audit and ftrace" patchset since it is required for audit and ftrace in
  case of !COMPAT, too.
  "arm64: is_compat_task is defined both in asm/compat.h and linux/compat.h"

Changes v2 -> v3:
* removed unnecessary 'type cast' operations [2/3]
* check for a return value (-1) of secure_computing() explicitly [2/3]
* aligned with the patch, "arm64: split syscall_trace() into separate
  functions for enter/exit" [2/3]
* changed default of CONFIG_SECCOMP to n [2/3]

Changes v1 -> v2:
* added generic seccomp.h for arm64 to utilize it [1,2/3] 
* changed syscall_trace() to return more meaningful value (-EPERM)
  on seccomp failure case [2/3]
* aligned with the change in "arm64: make a single hook to syscall_trace()
  for all syscall features" v2 [2/3]
* removed is_compat_task() definition from compat.h [3/3]

AKASHI Takahiro (2):
  asm-generic: Add generic seccomp.h for secure computing mode 1
  arm64: Add seccomp support

 arch/arm64/Kconfig   | 14 ++
 arch/arm64/include/asm/seccomp.h | 25 +
 arch/arm64/include/asm/unistd.h  |  3 +++
 arch/arm64/kernel/entry.S|  4 
 arch/arm64/kernel/ptrace.c   |  6 ++
 include/asm-generic/seccomp.h| 28 
 6 files changed, 80 insertions(+)
 create mode 100644 arch/arm64/include/asm/seccomp.h
 create mode 100644 include/asm-generic/seccomp.h

-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH_v4 2/2] arm64: Add seccomp support

2014-03-14 Thread AKASHI Takahiro

secure_computing() should always be called first in syscall_trace_enter().
If it returns non-zero, we should stop further handling. Then that system
call may eventually fail, be trapped or the process itself be killed
depending on loaded rules.
In this case, syscall_trace_enter() returns a dedicated value in order to
skip a normal syscall table lookup because a seccomp rule may have already
overridden errno.

Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/Kconfig   | 14 ++
 arch/arm64/include/asm/seccomp.h | 25 +
 arch/arm64/include/asm/unistd.h  |  3 +++
 arch/arm64/kernel/entry.S|  4 
 arch/arm64/kernel/ptrace.c   |  6 ++
 5 files changed, 52 insertions(+)
 create mode 100644 arch/arm64/include/asm/seccomp.h

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7c1f8c7..d5167d6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -28,6 +28,7 @@ config ARM64
select HARDIRQS_SW_RESEND
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
+   select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_BUGVERBOSE
@@ -230,6 +231,19 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 
 source "mm/Kconfig"
 
+config SECCOMP
+   bool "Enable seccomp to safely compute untrusted bytecode"
+   ---help---
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
 config XEN_DOM0
def_bool y
depends on XEN
diff --git a/arch/arm64/include/asm/seccomp.h b/arch/arm64/include/asm/seccomp.h
new file mode 100644
index 000..c76fac9
--- /dev/null
+++ b/arch/arm64/include/asm/seccomp.h
@@ -0,0 +1,25 @@
+/*
+ * arch/arm64/include/asm/seccomp.h
+ *
+ * Copyright (C) 2014 Linaro Limited
+ * Author: AKASHI Takahiro 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include 
+
+#ifdef CONFIG_COMPAT
+#define __NR_seccomp_read_32   __NR_compat_read
+#define __NR_seccomp_write_32  __NR_compat_write
+#define __NR_seccomp_exit_32   __NR_compat_exit
+#define __NR_seccomp_sigreturn_32  __NR_compat_rt_sigreturn
+#endif /* CONFIG_COMPAT */
+
+#include 
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 04f5d22..28bf882 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -30,6 +30,9 @@
  * Compat syscall numbers used by the AArch64 kernel.
  */
 #define __NR_compat_restart_syscall0
+#define __NR_compat_exit   1
+#define __NR_compat_read   3
+#define __NR_compat_write  4
 #define __NR_compat_sigreturn  119
 #define __NR_compat_rt_sigreturn   173
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index f06ee35..6ef266a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -650,6 +650,10 @@ ENDPROC(el0_svc)
 __sys_trace:
mov x0, sp
bl  syscall_trace_enter
+#ifdef CONFIG_SECCOMP
+   cmp w0, #-EPERM // check seccomp result
+   b.eqret_to_user // -EPERM means 'rejected'
+#endif
adr lr, __sys_trace_return  // return address
uxtwscno, w0// syscall number (possibly new)
mov x1, sp  // pointer to regs
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index f9e1339..bb89fa3 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -21,12 +21,14 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1093,6 +1095,10 @@ static void tracehook_report_syscall(struct pt_regs 
*regs,
 
 asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 {
+   if (secure_computing(regs->syscallno) == -1)
+   /* seccomp failures shouldn't expose any additional code. */
+   return -EPERM;
+
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
 
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a

[PATCH_v4 1/2] asm-generic: Add generic seccomp.h for secure computing mode 1

2014-03-14 Thread AKASHI Takahiro

Those values (__NR_seccomp_*) are used solely in secure_computing()
to identify mode 1 system calls. If compat system calls have different
syscall numbers, asm/seccomp.h may override them.

Acked-by: Arnd Bergmann 
Signed-off-by: AKASHI Takahiro 
---
 include/asm-generic/seccomp.h | 28 
 1 file changed, 28 insertions(+)
 create mode 100644 include/asm-generic/seccomp.h

diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h
new file mode 100644
index 000..5e97022
--- /dev/null
+++ b/include/asm-generic/seccomp.h
@@ -0,0 +1,28 @@
+/*
+ * include/asm-generic/seccomp.h
+ *
+ * Copyright (C) 2014 Linaro Limited
+ * Author: AKASHI Takahiro 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _ASM_GENERIC_SECCOMP_H
+#define _ASM_GENERIC_SECCOMP_H
+
+#include 
+
+#if defined(CONFIG_COMPAT) && !defined(__NR_seccomp_read_32)
+#define __NR_seccomp_read_32   __NR_read
+#define __NR_seccomp_write_32  __NR_write
+#define __NR_seccomp_exit_32   __NR_exit
+#define __NR_seccomp_sigreturn_32  __NR_rt_sigreturn
+#endif /* CONFIG_COMPAT && ! already defined */
+
+#define __NR_seccomp_read  __NR_read
+#define __NR_seccomp_write __NR_write
+#define __NR_seccomp_exit  __NR_exit
+#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+
+#endif /* _ASM_GENERIC_SECCOMP_H */
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH_v8 0/2] arm64: Add audit support

2014-03-14 Thread AKASHI Takahiro

(Please apply this patch after my ftrace patch to avoid a conflict in
arm64/kernel/ptrace.c; functionally, it doesn't depend on ftrace, though.)

This patchset adds system call audit support on arm64.
Both 32-bit (AUDIT_ARCH_ARM) and 64-bit tasks (AUDIT_ARCH_AARCH64)
are supported. Since arm64 has the exact same set of system calls
on LE and BE, we don't care about endianness (or more specifically, the
__AUDIT_ARCH_LE bit in AUDIT_ARCH_*).

There are some prerequisites for this patch to work correctly:
* "audit: Add CONFIG_HAVE_ARCH_AUDITSYSCALL" patch
* "audit: generic compat system call audit support" patch
* "arm64: __NR_* definitions for compat syscalls" patch from Catalin
* "arm64: make a single hook to syscall_trace() for all syscall features" patch
* "arm64: split syscall_trace() into separate functions for enter/exit" patch
* "arm64: Add regs_return_value() in syscall.h" patch
* "arm64: is_compat_task is defined both in asm/compat.h and 
   linux/compat.h" patch
* userspace audit tool (v2.3.2 + my patch for arm64)

Please review them as well for a better understanding.

This code was tested on both 32-bit and 64-bit LE userland 
in the following two ways:
1) basic operations with auditctl/autrace
  # auditctl -a exit,always -S openat -F path=/etc/inittab
  # auditctl -a exit,always -F dir=/tmp -F perm=rw
  # auditctl -a task,always
  # autrace /bin/ls
by comparing output from autrace with one from strace

2) audit-test-code (+ my workarounds for arm/arm64)
  by running "audit-tool", "filter" and "syscalls" test categories.

Changes v7 -> v8:
* aligned with the change in "audit: generic compat system call audit
  support" v5 [1/2]
* aligned with the change in "arm64: split syscall_trace() into separate
  functions for enter/exit" v5 [2/2]

Changes v6 -> v7:
* changed an include file in syscall.h from  to
   [1/2]
* aligned with the patch, "arm64: split syscall_trace() into separate
  functions for enter/exit" [2/2]

Changes v5 -> v6:
* removed and put "arm64: Add regs_return_value() in syscall.h" patch into
  a separate set
* aligned with the change in "arm64: make a single hook to syscall_trace()
  for all syscall features" v3 [1/2]

Changes v4 -> v5:
* rebased to 3.14-rcX
* added a guard against TIF_SYSCALL_AUDIT [3/3]
* aligned with the change in "arm64: make a single hook to syscall_trace()
  for all syscall features" v2 [3/3]

Changes v3 -> v4:
* Modified to sync with the patch, "make a single hook to syscall_trace()
  for all syscall features"
* aligned with "audit: Add CONFIG_HAVE_ARCH_AUDITSYSCALL" patch

Changes v2 -> v3:
* Remove asm/audit.h.
  See "generic compat syscall audit support" patch v4
* Remove endianness dependency, ie. AUDIT_ARCH_ARMEB/AARCH64EB.
* Remove kernel/syscalls/Makefile which was used to create unistd32.h.
  See Catalin's "Add __NR_* definitions for compat syscalls" patch

Changes v1 -> v2:
* Modified to utilize "generic compat system call audit" [3/6, 4/6, 5/6]
  Please note that a required header, unistd_32.h, is automatically
  generated from unistd32.h.
* Refer to regs->orig_x0 instead of regs->x0 as the first argument of
  system call in audit_syscall_entry() [6/6]
* Include "Add regs_return_value() in syscall.h" patch [2/6],
  which was intentionally not included in v1 because it could be added
  by "kprobes support".

AKASHI Takahiro (2):
  arm64: Add audit support
  arm64: audit: Add audit hook in syscall_trace_enter/exit()

 arch/arm64/Kconfig   |  2 ++
 arch/arm64/include/asm/syscall.h | 15 +++
 arch/arm64/kernel/ptrace.c   |  7 +++
 include/uapi/linux/audit.h   |  1 +
 4 files changed, 25 insertions(+)

-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH_v8 1/2] arm64: Add audit support

2014-03-14 Thread AKASHI Takahiro

On AArch64, audit is supported through generic lib/audit.c and
compat_audit.c, and so this patch adds arch specific definitions required.

Acked-by: Will Deacon 
Acked-by: Richard Guy Briggs 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/Kconfig   |  2 ++
 arch/arm64/include/asm/syscall.h | 15 +++
 include/uapi/linux/audit.h   |  1 +
 3 files changed, 18 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1dcdb4..7c1f8c7 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -9,6 +9,7 @@ config ARM64
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
+   select AUDIT_ARCH_COMPAT_GENERIC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
select COMMON_CLK
@@ -25,6 +26,7 @@ config ARM64
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
select HARDIRQS_SW_RESEND
+   select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_TRACEHOOK
select HAVE_C_RECORDMCOUNT
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index 383771e..ce3882f 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -16,7 +16,9 @@
 #ifndef __ASM_SYSCALL_H
 #define __ASM_SYSCALL_H
 
+#include 
 #include 
+#include 
 
 extern const void *sys_call_table[];
 
@@ -105,4 +107,17 @@ static inline void syscall_set_arguments(struct 
task_struct *task,
memcpy(>regs[i], args, n * sizeof(args[0]));
 }
 
+/*
+ * We don't care about endianness (__AUDIT_ARCH_LE bit) here because
+ * AArch64 has the same system calls both on little- and big- endian.
+ */
+static inline int syscall_get_arch(struct task_struct *task,
+  struct pt_regs *regs)
+{
+   if (is_compat_thread(task_thread_info(task)))
+   return AUDIT_ARCH_ARM;
+
+   return AUDIT_ARCH_AARCH64;
+}
+
 #endif /* __ASM_SYSCALL_H */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 624df43..aa86fab 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -333,6 +333,7 @@ enum {
 /* distinguish syscall tables */
 #define __AUDIT_ARCH_64BIT 0x8000
 #define __AUDIT_ARCH_LE   0x4000
+#define AUDIT_ARCH_AARCH64 (EM_AARCH64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ALPHA   (EM_ALPHA|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ARM (EM_ARM|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ARMEB   (EM_ARM)
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH_v8 2/2] arm64: audit: Add audit hook in syscall_trace_enter/exit()

2014-03-14 Thread AKASHI Takahiro

This patch adds auditing functions on entry to or exit from
every system call invocation.

Acked-by: Richard Guy Briggs 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/kernel/ptrace.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 3ee76ed..f9e1339 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -19,6 +19,7 @@
  * along with this program.  If not, see .
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -39,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -1097,11 +1099,16 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, regs->syscallno);
 
+   audit_syscall_entry(syscall_get_arch(current, regs), regs->syscallno,
+   regs->orig_x0, regs->regs[1], regs->regs[2], regs->regs[3]);
+
return regs->syscallno;
 }
 
 asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 {
+   audit_syscall_exit(regs);
+
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
trace_sys_exit(regs, regs_return_value(regs));
 
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH_v5] audit: Add generic compat syscall support

2014-03-14 Thread AKASHI Takahiro

lib/audit.c provides a generic function for auditing system calls.
This patch extends it for compat syscall support on bi-architectures
(32/64-bit) by adding lib/compat_audit.c.
The following are required to support this feature:
 * add asm/unistd32.h for compat system call names
 * select CONFIG_AUDIT_ARCH_COMPAT_GENERIC

Signed-off-by: AKASHI Takahiro 
---
 include/linux/audit.h  |  8 
 include/uapi/linux/audit.h |  6 ++
 lib/Kconfig|  9 +
 lib/Makefile   |  1 +
 lib/audit.c| 15 +-
 lib/compat_audit.c | 50 ++
 6 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 lib/compat_audit.c

diff --git a/include/linux/audit.h b/include/linux/audit.h
index aa865a9..85f0503 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -78,6 +78,14 @@ extern int is_audit_feature_set(int which);
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
+/* only for compat system calls */
+extern unsigned compat_write_class[];
+extern unsigned compat_read_class[];
+extern unsigned compat_dir_class[];
+extern unsigned compat_chattr_class[];
+extern unsigned compat_signal_class[];
+
+extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall);
 
 /* audit_names->type values */
 #defineAUDIT_TYPE_UNKNOWN  0   /* we don't know yet */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 2d48fe1..624df43 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -361,6 +361,12 @@ enum {
 #define AUDIT_ARCH_SPARC64 (EM_SPARCV9|__AUDIT_ARCH_64BIT)
 #define AUDIT_ARCH_X86_64  (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 
+#ifdef CONFIG_COMPAT
+#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
+#else
+#define audit_is_compat(arch)  false
+#endif
+
 #define AUDIT_PERM_EXEC1
 #define AUDIT_PERM_WRITE   2
 #define AUDIT_PERM_READ4
diff --git a/lib/Kconfig b/lib/Kconfig
index 991c98b..1e80cb3 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -182,6 +182,15 @@ config AUDIT_GENERIC
depends on AUDIT && !AUDIT_ARCH
default y
 
+config AUDIT_ARCH_COMPAT_GENERIC
+   bool
+   default n
+
+config AUDIT_COMPAT_GENERIC
+   bool
+   depends on AUDIT_GENERIC && AUDIT_ARCH_COMPAT_GENERIC && COMPAT
+   default y
+
 config RANDOM32_SELFTEST
bool "PRNG perform self test on init"
default n
diff --git a/lib/Makefile b/lib/Makefile
index 48140e3..0cd7b68 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
+obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
 obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
diff --git a/lib/audit.c b/lib/audit.c
index 76bbed4..1d726a2 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -30,11 +30,17 @@ static unsigned signal_class[] = {
 
 int audit_classify_arch(int arch)
 {
-   return 0;
+   if (audit_is_compat(arch))
+   return 1;
+   else
+   return 0;
 }
 
 int audit_classify_syscall(int abi, unsigned syscall)
 {
+   if (audit_is_compat(abi))
+   return audit_classify_compat_syscall(abi, syscall);
+
switch(syscall) {
 #ifdef __NR_open
case __NR_open:
@@ -57,6 +63,13 @@ int audit_classify_syscall(int abi, unsigned syscall)
 
 static int __init audit_classes_init(void)
 {
+#ifdef CONFIG_AUDIT_COMPAT_GENERIC
+   audit_register_class(AUDIT_CLASS_WRITE_32, compat_write_class);
+   audit_register_class(AUDIT_CLASS_READ_32, compat_read_class);
+   audit_register_class(AUDIT_CLASS_DIR_WRITE_32, compat_dir_class);
+   audit_register_class(AUDIT_CLASS_CHATTR_32, compat_chattr_class);
+   audit_register_class(AUDIT_CLASS_SIGNAL_32, compat_signal_class);
+#endif
audit_register_class(AUDIT_CLASS_WRITE, write_class);
audit_register_class(AUDIT_CLASS_READ, read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
diff --git a/lib/compat_audit.c b/lib/compat_audit.c
new file mode 100644
index 000..873f75b
--- /dev/null
+++ b/lib/compat_audit.c
@@ -0,0 +1,50 @@
+#include 
+#include 
+#include 
+
+unsigned compat_dir_class[] = {
+#include 
+~0U
+};
+
+unsigned compat_read_class[] = {
+#include 
+~0U
+};
+
+unsigned compat_write_class[] = {
+#include 
+~0U
+};
+
+unsigned compat_chattr_class[] = {
+#include 
+~0U
+};
+
+unsigned compat_signal_class[] = {
+#include 
+~0U
+};
+
+int audit_classify_compat_syscall(int abi, unsigned syscall)
+{
+   switch (syscall) {
+#ifdef __NR_open
+   case __NR_open:
+   return 2;
+#endif
+#ifdef __NR_openat
+   case

[PATCH_v5] audit: generic compat system call support

2014-03-14 Thread AKASHI Takahiro

Arm64 supports 32-bit mode (AArch32) and 64-bit mode (AArch64).
To enable audit on arm64, we want to use lib/audit.c and re-work it
to support compat system calls as well without copying it under
arch sub-directory.

Since this patch is implemented in much the same way as on existing
bi-architectures (ie. ppc, s390, sparc and x86), it's not difficult
for them to utilize this generic code instead of their own implementation.

The code was tested on armv8 fast model with 64-bit and 32-bit userland
by using modified audit-test-code. As this patch is mandatory for my
"system call audit support for arm64" patch, please review it as well
for a better understanding.

Changes v4 -> v5:
* Add CONFIG_AUDIT_ARCH_COMPAT_GENERIC. Its purpose is to avoid compiling
  compat_audit.c unintentionally. Some architectures may enable AUDIT,
  COMPAT and then AUDIT_GENERIC if !AUDIT_ARCH, but do not really need
  compat_audit.c.
  (mips is such a case, although mips doesn't really support AUDITSYSCALL.)

Changes v3 -> v4:
* Add CONFIG_AUDIT_COMPAT_GENERIC to compile in compat_audit.c
* Re-define audit_is_compat() in generic way in order to eliminate
  necessity of asm/audit.h.

Changes v2 -> v3:
* Specify AUDIT_CLASS_XYZ_32 instead of AUDIT_CLASS_XYZ when registering
  compat syscalls (bug fix)

AKASHI Takahiro (1):
  audit: Add generic compat syscall support

 include/linux/audit.h  |  8 
 include/uapi/linux/audit.h |  6 ++
 lib/Kconfig|  9 +
 lib/Makefile   |  1 +
 lib/audit.c| 15 +-
 lib/compat_audit.c | 50 ++
 6 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 lib/compat_audit.c

-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 7/7] arm64: ftrace: Add system call tracepoint

2014-03-14 Thread AKASHI Takahiro

This patch allows system call entry or exit to be traced as ftrace events,
ie. sys_enter_*/sys_exit_*, if CONFIG_FTRACE_SYSCALLS is enabled.
Those events appear and can be controlled under
${sysfs}/tracing/events/syscalls/

Please note that we can't trace compat system calls here because
AArch32 mode does not share the same syscall table with AArch64.
Just define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS in order to avoid unexpected
results (bogus syscalls reported or even hang-up).

Acked-by: Will Deacon 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/Kconfig   |  1 +
 arch/arm64/include/asm/ftrace.h  | 18 ++
 arch/arm64/include/asm/syscall.h |  1 +
 arch/arm64/include/asm/unistd.h  |  2 ++
 arch/arm64/kernel/ptrace.c   |  9 +
 5 files changed, 31 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6954959..b1dcdb4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -43,6 +43,7 @@ config ARM64
select HAVE_MEMBLOCK
select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS
+   select HAVE_SYSCALL_TRACEPOINTS
select IRQ_DOMAIN
select MODULES_USE_ELF_RELA
select NO_BOOTMEM
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index c44c4b1..7616255 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -44,6 +44,24 @@ static inline unsigned long ftrace_call_adjust(unsigned long 
addr)
 #define CALLER_ADDR4 ((unsigned long)return_address(4))
 #define CALLER_ADDR5 ((unsigned long)return_address(5))
 #define CALLER_ADDR6 ((unsigned long)return_address(6))
+
+#include 
+
+/*
+ * Because AArch32 mode does not share the same syscall table with AArch64,
+ * tracing compat syscalls may result in reporting bogus syscalls or even
+ * hang-up, so just do not trace them.
+ * See kernel/trace/trace_syscalls.c
+ *
+ * x86 code says:
+ * If the user realy wants these, then they should use the
+ * raw syscall tracepoints with filtering.
+ */
+#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
+static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
+{
+   return is_compat_task();
+}
 #endif /* ifndef __ASSEMBLY__ */
 
 #endif /* __ASM_FTRACE_H */
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index 70ba9d4..383771e 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -18,6 +18,7 @@
 
 #include 
 
+extern const void *sys_call_table[];
 
 static inline int syscall_get_nr(struct task_struct *task,
 struct pt_regs *regs)
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 82ce217..c335479 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -28,3 +28,5 @@
 #endif
 #define __ARCH_WANT_SYS_CLONE
 #include 
+
+#define NR_syscalls (__NR_syscalls)
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index c47a3ed..3ee76ed 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -42,6 +42,9 @@
 #include 
 #include 
 
+#define CREATE_TRACE_POINTS
+#include 
+
 /*
  * TODO: does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
@@ -1091,11 +1094,17 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
 
+   if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+   trace_sys_enter(regs, regs->syscallno);
+
return regs->syscallno;
 }
 
 asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 {
+   if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+   trace_sys_exit(regs, regs_return_value(regs));
+
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
 }
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 6/7] arm64: ftrace: Add CALLER_ADDRx macros

2014-03-14 Thread AKASHI Takahiro

CALLER_ADDRx returns caller's address at specified level in call stacks.
They are used for several tracers like irqsoff and preemptoff.
Strangely, however, they are referred to even without FTRACE.

Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/include/asm/ftrace.h| 13 -
 arch/arm64/kernel/Makefile |  3 ++-
 arch/arm64/kernel/return_address.c | 55 ++
 3 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm64/kernel/return_address.c

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index ed5c448..c44c4b1 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -18,6 +18,7 @@
 
 #ifndef __ASSEMBLY__
 extern void _mcount(unsigned long);
+extern void *return_address(unsigned int);
 
 struct dyn_arch_ftrace {
/* No extra data needed for arm64 */
@@ -33,6 +34,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long 
addr)
 */
return addr;
 }
-#endif /* __ASSEMBLY__ */
+
+#define HAVE_ARCH_CALLER_ADDR
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)return_address(1))
+#define CALLER_ADDR2 ((unsigned long)return_address(2))
+#define CALLER_ADDR3 ((unsigned long)return_address(3))
+#define CALLER_ADDR4 ((unsigned long)return_address(4))
+#define CALLER_ADDR5 ((unsigned long)return_address(5))
+#define CALLER_ADDR6 ((unsigned long)return_address(6))
+#endif /* ifndef __ASSEMBLY__ */
 
 #endif /* __ASM_FTRACE_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ac67fd0..b5bfa7f 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -7,12 +7,13 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
 
 CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_insn.o = -pg
+CFLAGS_REMOVE_return_address.o = -pg
 
 # Object file lists.
 arm64-obj-y:= cputable.o debug-monitors.o entry.o irq.o fpsimd.o   
\
   entry-fpsimd.o process.o ptrace.o setup.o signal.o   
\
   sys.o stacktrace.o time.o traps.o io.o vdso.o
\
-  hyp-stub.o psci.o cpu_ops.o insn.o
+  hyp-stub.o psci.o cpu_ops.o insn.o return_address.o
 
 arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o 
\
   sys_compat.o
diff --git a/arch/arm64/kernel/return_address.c 
b/arch/arm64/kernel/return_address.c
new file mode 100644
index 000..89102a6
--- /dev/null
+++ b/arch/arm64/kernel/return_address.c
@@ -0,0 +1,55 @@
+/*
+ * arch/arm64/kernel/return_address.c
+ *
+ * Copyright (C) 2013 Linaro Limited
+ * Author: AKASHI Takahiro 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+
+#include 
+
+struct return_address_data {
+   unsigned int level;
+   void *addr;
+};
+
+static int save_return_addr(struct stackframe *frame, void *d)
+{
+   struct return_address_data *data = d;
+
+   if (!data->level) {
+   data->addr = (void *)frame->pc;
+   return 1;
+   } else {
+   --data->level;
+   return 0;
+   }
+}
+
+void *return_address(unsigned int level)
+{
+   struct return_address_data data;
+   struct stackframe frame;
+   register unsigned long current_sp asm ("sp");
+
+   data.level = level + 2;
+   data.addr = NULL;
+
+   frame.fp = (unsigned long)__builtin_frame_address(0);
+   frame.sp = current_sp;
+   frame.pc = (unsigned long)return_address; /* dummy */
+
+   walk_stackframe(, save_return_addr, );
+
+   if (!data.level)
+   return data.addr;
+   else
+   return NULL;
+}
+EXPORT_SYMBOL_GPL(return_address);
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 5/7] arm64: ftrace: Add dynamic ftrace support

2014-03-14 Thread AKASHI Takahiro

This patch allows "dynamic ftrace" if CONFIG_DYNAMIC_FTRACE is enabled.
Here we can turn tracing on and off dynamically on a per-function basis.

On arm64, this is done by patching the single branch instruction to _mcount()
inserted by the gcc -pg option. The branch is initially replaced with a NOP at
kernel start-up; later on, the NOP is replaced with a branch to
ftrace_caller() when tracing is enabled, and the branch is replaced with a
NOP again when tracing is disabled.
Please note that ftrace_caller() is a counterpart of _mcount() in case of
'static' ftrace.

More details on architecture specific requirements are described in
Documentation/trace/ftrace-design.txt.

Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/Kconfig   |   1 +
 arch/arm64/include/asm/ftrace.h  |  15 ++
 arch/arm64/kernel/entry-ftrace.S |  43 +++
 arch/arm64/kernel/ftrace.c   | 114 +++
 4 files changed, 173 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6b3fef6..6954959 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -33,6 +33,7 @@ config ARM64
select HAVE_DMA_API_DEBUG
select HAVE_DMA_ATTRS
select HAVE_DMA_CONTIGUOUS
+   select HAVE_DYNAMIC_FTRACE
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index 58ea595..ed5c448 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -18,6 +18,21 @@
 
 #ifndef __ASSEMBLY__
 extern void _mcount(unsigned long);
+
+struct dyn_arch_ftrace {
+   /* No extra data needed for arm64 */
+};
+
+extern unsigned long ftrace_graph_call;
+
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+   /*
+* addr is the address of the mcount call instruction.
+* recordmcount does the necessary offset calculation.
+*/
+   return addr;
+}
 #endif /* __ASSEMBLY__ */
 
 #endif /* __ASM_FTRACE_H */
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 622846f..d0cad6d 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -86,6 +86,7 @@
add \reg, \reg, #8
.endm
 
+#ifndef CONFIG_DYNAMIC_FTRACE
 /*
  * void _mcount(unsigned long return_address)
  * @return_address: return address to instrumented function
@@ -134,6 +135,48 @@ skip_ftrace_call:
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 ENDPROC(_mcount)
 
+#else /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * _mcount() is used to build the kernel with -pg option, but all the branch
+ * instructions to _mcount() are replaced to NOP initially at kernel start up,
+ * and later on, NOP to branch to ftrace_caller() when enabled or branch to
+ * NOP when disabled per-function base.
+ */
+ENTRY(_mcount)
+   ret
+ENDPROC(_mcount)
+
+/*
+ * void ftrace_caller(unsigned long return_address)
+ * @return_address: return address to instrumented function
+ *
+ * This function is a counterpart of _mcount() in 'static' ftrace, and
+ * makes calls to:
+ * - tracer function to probe instrumented function's entry,
+ * - ftrace_graph_caller to set up an exit hook
+ */
+ENTRY(ftrace_caller)
+   mcount_enter
+
+   mcount_get_pc0  x0  // function's pc
+   mcount_get_lr   x1  // function's lr
+
+   .global ftrace_call
+ftrace_call:   // tracer(pc, lr);
+   nop // This will be replaced with "bl xxx"
+   // where xxx can be any kind of tracer.
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+   .global ftrace_graph_call
+ftrace_graph_call: // ftrace_graph_caller();
+   nop // If enabled, this will be replaced
+   // "b ftrace_graph_caller"
+#endif
+
+   mcount_exit
+ENDPROC(ftrace_caller)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 ENTRY(ftrace_stub)
ret
 ENDPROC(ftrace_stub)
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index a559ab8..9f708e7 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -17,6 +17,89 @@
 #include 
 #include 
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * Replace a single instruction, which may be a branch or NOP.
+ * If @validate == true, a replaced instruction is checked against 'old'.
+ */
+static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
+ bool validate)
+{
+   u32 replaced;
+
+   /*
+* Note:
+* Due to modules and __init, code can disappear and change,
+* we need to protect against faulting as well as code changing.
+* We do this by aarch64_insn_*() which use the probe_kernel_*().
+*
+* No lock is held here because all the modifications are run
+* through stop_machine().
+*/
+   if (validate) {
+   if

[PATCH v7 4/7] arm64: Add ftrace support

2014-03-14 Thread AKASHI Takahiro

This patch implements arm64 specific part to support function tracers,
such as function (CONFIG_FUNCTION_TRACER), function_graph
(CONFIG_FUNCTION_GRAPH_TRACER) and function profiler
(CONFIG_FUNCTION_PROFILER).

With 'function' tracer, all the functions in the kernel are traced with
timestamps in ${sysfs}/tracing/trace. If function_graph tracer is
specified, call graph is generated.

The kernel must be compiled with -pg option so that _mcount() is inserted
at the beginning of functions. This function is called on every function's
entry as long as tracing is enabled.
In addition, function_graph tracer also needs to be able to probe function's
exit. ftrace_graph_caller() & return_to_handler do this by faking link
register's value to intercept function's return path.

More details on architecture specific requirements are described in
Documentation/trace/ftrace-design.txt.

Reviewed-by: Ganapatrao Kulkarni 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/Kconfig   |   2 +
 arch/arm64/include/asm/ftrace.h  |  23 +
 arch/arm64/kernel/Makefile   |   4 +
 arch/arm64/kernel/arm64ksyms.c   |   4 +
 arch/arm64/kernel/entry-ftrace.S | 175 +++
 arch/arm64/kernel/ftrace.c   |  64 ++
 6 files changed, 272 insertions(+)
 create mode 100644 arch/arm64/include/asm/ftrace.h
 create mode 100644 arch/arm64/kernel/entry-ftrace.S
 create mode 100644 arch/arm64/kernel/ftrace.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 340e344..6b3fef6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -35,6 +35,8 @@ config ARM64
select HAVE_DMA_CONTIGUOUS
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_FTRACE_MCOUNT_RECORD
+   select HAVE_FUNCTION_TRACER
+   select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_GENERIC_DMA_COHERENT
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_MEMBLOCK
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
new file mode 100644
index 000..58ea595
--- /dev/null
+++ b/arch/arm64/include/asm/ftrace.h
@@ -0,0 +1,23 @@
+/*
+ * arch/arm64/include/asm/ftrace.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ * Author: AKASHI Takahiro 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_FTRACE_H
+#define __ASM_FTRACE_H
+
+#include <asm/insn.h>
+
+#define MCOUNT_ADDR((unsigned long)_mcount)
+#define MCOUNT_INSN_SIZE   AARCH64_INSN_SIZE
+
+#ifndef __ASSEMBLY__
+extern void _mcount(unsigned long);
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_FTRACE_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2d4554b..ac67fd0 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -5,6 +5,9 @@
 CPPFLAGS_vmlinux.lds   := -DTEXT_OFFSET=$(TEXT_OFFSET)
 AFLAGS_head.o  := -DTEXT_OFFSET=$(TEXT_OFFSET)
 
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_insn.o = -pg
+
 # Object file lists.
 arm64-obj-y:= cputable.o debug-monitors.o entry.o irq.o fpsimd.o   
\
   entry-fpsimd.o process.o ptrace.o setup.o signal.o   
\
@@ -13,6 +16,7 @@ arm64-obj-y   := cputable.o debug-monitors.o entry.o 
irq.o fpsimd.o   \
 
 arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o 
\
   sys_compat.o
+arm64-obj-$(CONFIG_FUNCTION_TRACER)+= ftrace.o entry-ftrace.o
 arm64-obj-$(CONFIG_MODULES)+= arm64ksyms.o module.o
 arm64-obj-$(CONFIG_SMP)+= smp.o smp_spin_table.o
 arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568..7f0512f 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,7 @@ EXPORT_SYMBOL(clear_bit);
 EXPORT_SYMBOL(test_and_clear_bit);
 EXPORT_SYMBOL(change_bit);
 EXPORT_SYMBOL(test_and_change_bit);
+
+#ifdef CONFIG_FUNCTION_TRACER
+EXPORT_SYMBOL(_mcount);
+#endif
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
new file mode 100644
index 000..622846f
--- /dev/null
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -0,0 +1,175 @@
+/*
+ * arch/arm64/kernel/entry-ftrace.S
+ *
+ * Copyright (C) 2013 Linaro Limited
+ * Author: AKASHI Takahiro 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+
+/*
+ * Gcc with -pg will put the following code in the beginning of each function:
+ *  mov x0, x30
+ *  bl _mcount
+ * [function's body ...]
+ * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic
+ * ftrace is enabled.
+ *
+ * Please note that x0 as an argument

[PATCH v5 1/4] arm64: make a single hook to syscall_trace() for all syscall features

2014-03-14 Thread AKASHI Takahiro

Currently syscall_trace() is called only for ptrace.
With additional TIF_xx flags defined, it is now called in all the cases
of audit, ftrace and seccomp in addition to ptrace.

Acked-by: Richard Guy Briggs 
Acked-by: Will Deacon 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/include/asm/thread_info.h | 13 +
 arch/arm64/kernel/entry.S|  5 +++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 720e70b..0a8b2a9 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -91,6 +91,9 @@ static inline struct thread_info *current_thread_info(void)
 /*
  * thread information flags:
  *  TIF_SYSCALL_TRACE  - syscall trace active
+ *  TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace
+ *  TIF_SYSCALL_AUDIT  - syscall auditing
+ *  TIF_SECCOMP - syscall secure computing
  *  TIF_SIGPENDING - signal pending
  *  TIF_NEED_RESCHED   - rescheduling necessary
  *  TIF_NOTIFY_RESUME  - callback before returning to user
@@ -101,6 +104,9 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED   1
 #define TIF_NOTIFY_RESUME  2   /* callback before returning to user */
 #define TIF_SYSCALL_TRACE  8
+#define TIF_SYSCALL_AUDIT  9
+#define TIF_SYSCALL_TRACEPOINT 10
+#define TIF_SECCOMP11
 #define TIF_POLLING_NRFLAG 16
 #define TIF_MEMDIE 18  /* is terminating due to OOM killer */
 #define TIF_FREEZE 19
@@ -112,10 +118,17 @@ static inline struct thread_info 
*current_thread_info(void)
 #define _TIF_SIGPENDING(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED  (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
+#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
+#define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SECCOMP   (1 << TIF_SECCOMP)
 #define _TIF_32BIT (1 << TIF_32BIT)
 
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 _TIF_NOTIFY_RESUME)
 
+#define _TIF_SYSCALL_WORK  (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_THREAD_INFO_H */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 39ac630..f9f2cae 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -631,8 +631,9 @@ el0_svc_naked:  // 
compat entry point
enable_irq
 
get_thread_info tsk
-   ldr x16, [tsk, #TI_FLAGS]   // check for syscall tracing
-   tbnzx16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls?
+   ldr x16, [tsk, #TI_FLAGS]   // check for syscall hooks
+   tst x16, #_TIF_SYSCALL_WORK
+   b.ne__sys_trace
adr lr, ret_fast_syscall// return address
cmp scno, sc_nr // check upper syscall limit
b.hsni_sys
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 3/7] ftrace: Add arm64 support to recordmcount

2014-03-14 Thread AKASHI Takahiro

Recordmcount utility under scripts is run, after compiling each object,
to find out all the locations of calling _mcount() and put them into
specific section named __mcount_loc.
Then linker collects all such information into a table in the kernel image
(between __start_mcount_loc and __stop_mcount_loc) for later use by ftrace.

This patch adds arm64 specific definitions to identify such locations.
There are two types of implementation, C and Perl. On arm64, only C version
is used to build the kernel now that CONFIG_HAVE_C_RECORDMCOUNT is on.
But Perl version is also maintained.

This patch also contains a workaround just in case where a header file,
elf.h, on host machine doesn't have definitions of EM_AARCH64 nor
R_AARCH64_ABS64. Without them, compiling C version of recordmcount will
fail.

Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/Kconfig  | 2 ++
 scripts/recordmcount.c  | 7 +++
 scripts/recordmcount.pl | 5 +
 3 files changed, 14 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 27bbcfc..340e344 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -27,12 +27,14 @@ config ARM64
select HARDIRQS_SW_RESEND
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_TRACEHOOK
+   select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_BUGVERBOSE
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
select HAVE_DMA_ATTRS
select HAVE_DMA_CONTIGUOUS
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+   select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_GENERIC_DMA_COHERENT
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_MEMBLOCK
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index 9c22317..e11aa4a 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -40,6 +40,11 @@
 #define R_METAG_NONE 3
 #endif
 
+#ifndef EM_AARCH64
+#define EM_AARCH64 183
+#define R_AARCH64_ABS64257
+#endif
+
 static int fd_map; /* File descriptor for file being modified. */
 static int mmap_failed; /* Boolean flag. */
 static void *ehdr_curr; /* current ElfXX_Ehdr *  for resource cleanup */
@@ -347,6 +352,8 @@ do_file(char const *const fname)
case EM_ARM: reltype = R_ARM_ABS32;
 altmcount = "__gnu_mcount_nc";
 break;
+   case EM_AARCH64:
+reltype = R_AARCH64_ABS64; gpfx = '_'; break;
case EM_IA_64:   reltype = R_IA64_IMM64;   gpfx = '_'; break;
case EM_METAG:   reltype = R_METAG_ADDR32;
 altmcount = "_mcount_wrapper";
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 91280b8..397b6b8 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -279,6 +279,11 @@ if ($arch eq "x86_64") {
 $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" .
"\\s+(__gnu_mcount_nc|mcount)\$";
 
+} elsif ($arch eq "arm64") {
+$alignment = 3;
+$section_type = '%progbits';
+$mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_AARCH64_CALL26\\s+_mcount\$";
+$type = ".quad";
 } elsif ($arch eq "ia64") {
 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
 $type = "data8";
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 2/7] arm64: Add 'notrace' attribute to unwind_frame() for ftrace

2014-03-14 Thread AKASHI Takahiro

walk_stackframe() calls unwind_frame(), and if walk_stackframe() is
"notrace", unwind_frame() should be also "notrace".

Acked-by: Will Deacon 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/kernel/stacktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index c3b6c63..54122c4 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -35,7 +35,7 @@
  * ldp x29, x30, [sp]
  * add sp, sp, #0x10
  */
-int unwind_frame(struct stackframe *frame)
+int notrace unwind_frame(struct stackframe *frame)
 {
unsigned long high, low;
unsigned long fp = frame->fp;
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 1/7] arm64: add __ASSEMBLY__ in asm/insn.h

2014-03-14 Thread AKASHI Takahiro

Since insn.h is indirectly included in asm/entry-ftrace.S,
we need to exclude some declarations by __ASSEMBLY__.

Acked-by: Will Deacon 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/include/asm/insn.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index c44ad39..dc1f73b 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -21,6 +21,7 @@
 /* A64 instructions are always 32 bits. */
 #defineAARCH64_INSN_SIZE   4
 
+#ifndef __ASSEMBLY__
 /*
  * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a
  * Section C3.1 "A64 instruction index by encoding":
@@ -104,5 +105,6 @@ bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn);
 int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
 int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt);
 int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
+#endif /* __ASSEMBLY__ */
 
 #endif /* __ASM_INSN_H */
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 0/7] arm64: Add ftrace support

2014-03-14 Thread AKASHI Takahiro

This patchset implements a function tracer on arm64.
There was another implementation from Cavium network, but both of us agreed
to use my patchset as future base. He is supposed to review this code, too.

The only issue that I had some concern on was "fault protection" code
in prepare_ftrace_return(). With discussions with Steven and Tim (as author
of arm ftrace), I removed that code since I'm not quite sure about possibility
of "fault" occurrences in this function.

The code is tested on ARMv8 Fast Model with the following tracers & events:
 function tracer with dynamic ftrace
 function graph tracer with dynamic ftrace
 syscall tracepoint (but only for AArch64 tasks)
 irqsoff & preemptirqsoff (which use CALLER_ADDRx)
and also verified with in-kernel tests, FTRACE_SELFTEST, FTRACE_STARTUP_TEST
and EVENT_TRACE_TEST_SYSCALLS.

Prerequisites are:
 * "arm64: make a single hook to syscall_trace() for all syscall features" patch
 * "arm64: split syscall_trace() into separate functions for enter/exit" patch
 * "arm64: Add regs_return_value() in syscall.h" patch
 * "arm64: is_compat_task is defined both in asm/compat.h and
linux/compat.h" patch

Please be careful:
* Patch [3/7] gets warnings from checkpatch, but they are based on the
  original's coding style.
* Patch [7/7] may conflict with my audit patch because both change the same
  location in syscall_trace_enter/exit(). I expect the functions are called
  in this order:
  On entry,
 * tracehook_report_syscall(ENTER)
 * trace_sys_enter()
 * audit_syscall_entry()
  On exit,
 * audit_syscall_exit()
 * trace_sys_exit()
 * tracehook_report_syscall(EXIT) 

Changes from v6 to v7:
* changed to use gpfx variable instead of defining altmcount in recordmcount.c
  [3/7]
* declared return_to_handler using ENTRY/END macros [4/7]
* changed to use u32 instead of int as instruction words, and simplified
  ftrace_modify_graph_caller() [5/7]
* simplified arch_trace_is_compat_call() for readability [7/7]
* added the following patch to prerequisite list,
  "arm64: is_compat_task is defined both in asm/compat.h and linux/compat.h"
  and changed to use linux/compat.h instead of asm/compat.h in asm/ftrace.h
  to avoid compile errors against some files (ie. do_mounts.c and etc) if
  FTRACE & !COMPAT. [7/7]

Changes from v5 to v6:
* changed the order of patches to avoid any bisect error (I have not tried
  though)
* added EM_AARCH64 and R_AARCH64_ABS64 definitions in scripts/recordmcount.c
  just in case elf.h on host machine doesn't have them. [3/7]
* updated a frame pointer (x29) in _mcount() to make it look like a normal
  function [4/7]
* aligned with the patch, "arm64: split syscall_trace() into separate
  functions for enter/exit" [7/7]
* defined ARCH_TRACE_IGNORE_COMPAT_SYSCALLS in order not to trace compat
  syscalls [7/7]

Changes from v4 to v5:
* improved the description of stack layout [1/7]
* aligned with the change in "arm64: make a single hook to syscall_trace()
  for all syscall features" v3 [5/7]

Changes from v3 to v4:
* removed unnecessary "#ifdef" [1,2/7]
* changed stack depth from 48B to 16B in mcount()/ftrace_caller() (a bug) [1/7]
* changed MCOUNT_INSN_SIZE to AARCH64_INSN_SIZE [1,7/7]
* added a guard against TIF_SYSCALL_TRACEPOINT [5/7]
* corrected the second argument passed to trace_sys_exit() (a bug) [5/7]
* aligned with the change in "arm64: make a single hook to syscall_trace()
  for all syscall features" v2 [5/7]

Changes from v2 to v3:
* optimized register usages in asm (by not saving x0, x1, and x2)
* removed "fault protection" code in prepare_ftrace_return()
* rewrote ftrace_modify_code() using "hotpatch" interfaces
* revised descriptions in comments

Changes from v1 to v2:
* split one patch into several pieces for easier review
  (especially function tracer + dynamic ftrace + CALLER_ADDRx)
* put return_address() in a separate file
* renamed __mcount to _mcount (it was my mistake)
* changed stackframe handling to get parent's frame pointer
* removed ARCH_SUPPORTS_FTRACE_OPS
* switched to "hotpatch" interfaces from Huawei
* revised descriptions in comments

AKASHI Takahiro (7):
  arm64: add __ASSEMBLY__ in asm/insn.h
  arm64: Add 'notrace' attribute to unwind_frame() for ftrace
  ftrace: Add arm64 support to recordmcount
  arm64: Add ftrace support
  arm64: ftrace: Add dynamic ftrace support
  arm64: ftrace: Add CALLER_ADDRx macros
  arm64: ftrace: Add system call tracepoint

 arch/arm64/Kconfig |   6 +
 arch/arm64/include/asm/ftrace.h|  67 
 arch/arm64/include/asm/insn.h  |   2 +
 arch/arm64/include/asm/syscall.h   |   1 +
 arch/arm64/include/asm/unistd.h|   2 +
 arch/arm64/kernel/Makefile |   7 +-
 arch/arm64/kernel/arm64ksyms.c |   4 +
 arch/arm64/kernel/entry-ftrace.S   | 218 +
 arch/arm64/kernel/ftrace.c | 178 ++
 arch/arm64/kernel/ptrace.c |   9 ++

[PATCH v5 3/4] arm64: Add regs_return_value() in syscall.h

2014-03-14 Thread AKASHI Takahiro

This macro, regs_return_value, is used mainly for audit to record system
call's results, but may also be used in test_kprobes.c.

Acked-by: Will Deacon 
Acked-by: Richard Guy Briggs 
Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/include/asm/ptrace.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 0e7fa49..5800ec1 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -134,6 +134,11 @@ struct pt_regs {
 #define user_stack_pointer(regs) \
((regs)->sp)
 
+static inline unsigned long regs_return_value(struct pt_regs *regs)
+{
+   return regs->regs[0];
+}
+
 /*
  * Are the current registers suitable for user mode? (used to maintain
  * security in signal handlers)
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 2/4] arm64: split syscall_trace() into separate functions for enter/exit

2014-03-14 Thread AKASHI Takahiro

As done in arm, this change makes it easy to confirm we invoke syscall
related hooks, including syscall tracepoint, audit and seccomp which would
be implemented later, in correct order. That is, undoing operations in the
opposite order on exit that they were done on entry.

Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/kernel/entry.S  | 10 --
 arch/arm64/kernel/ptrace.c | 50 +++---
 2 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index f9f2cae..00d6eb9 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -649,9 +649,8 @@ ENDPROC(el0_svc)
 * switches, and waiting for our parent to respond.
 */
 __sys_trace:
-   mov x1, sp
-   mov w0, #0  // trace entry
-   bl  syscall_trace
+   mov x0, sp
+   bl  syscall_trace_enter
adr lr, __sys_trace_return  // return address
uxtwscno, w0// syscall number (possibly new)
mov x1, sp  // pointer to regs
@@ -666,9 +665,8 @@ __sys_trace:
 
 __sys_trace_return:
str x0, [sp]// save returned x0
-   mov x1, sp
-   mov w0, #1  // trace exit
-   bl  syscall_trace
+   mov x0, sp
+   bl  syscall_trace_exit
b   ret_to_user
 
 /*
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 6a8928b..f606276 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1058,35 +1058,43 @@ long arch_ptrace(struct task_struct *child, long 
request,
return ptrace_request(child, request, addr, data);
 }
 
-asmlinkage int syscall_trace(int dir, struct pt_regs *regs)
+enum ptrace_syscall_dir {
+   PTRACE_SYSCALL_ENTER = 0,
+   PTRACE_SYSCALL_EXIT,
+};
+
+static void tracehook_report_syscall(struct pt_regs *regs,
+enum ptrace_syscall_dir dir)
 {
+   int scrach;
unsigned long saved_reg;
 
-   if (!test_thread_flag(TIF_SYSCALL_TRACE))
-   return regs->syscallno;
-
-   if (is_compat_task()) {
-   /* AArch32 uses ip (r12) for scratch */
-   saved_reg = regs->regs[12];
-   regs->regs[12] = dir;
-   } else {
-   /*
-* Save X7. X7 is used to denote syscall entry/exit:
-*   X7 = 0 -> entry, = 1 -> exit
-*/
-   saved_reg = regs->regs[7];
-   regs->regs[7] = dir;
-   }
+   /*
+* A scratch register (ip(r12) on AArch32, x7 on AArch64) is
+* used to denote syscall entry/exit:
+*/
+   scrach = (is_compat_task() ? 12 : 7);
+   saved_reg = regs->regs[scrach];
+   regs->regs[scrach] = dir;
 
-   if (dir)
+   if (dir == PTRACE_SYSCALL_EXIT)
tracehook_report_syscall_exit(regs, 0);
else if (tracehook_report_syscall_entry(regs))
regs->syscallno = ~0UL;
 
-   if (is_compat_task())
-   regs->regs[12] = saved_reg;
-   else
-   regs->regs[7] = saved_reg;
+   regs->regs[scrach] = saved_reg;
+}
+
+asmlinkage int syscall_trace_enter(struct pt_regs *regs)
+{
+   if (test_thread_flag(TIF_SYSCALL_TRACE))
+   tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
 
return regs->syscallno;
 }
+
+asmlinkage void syscall_trace_exit(struct pt_regs *regs)
+{
+   if (test_thread_flag(TIF_SYSCALL_TRACE))
+   tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
+}
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 0/4] arm64: prerequisites for audit and ftrace

2014-03-14 Thread AKASHI Takahiro

This patchset contains some patches commonly applied for audit and ftrace.

Patch [1/4] defines syscall trace related TIF_* flags in order to add hooks,
including ftrace, audit and seccomp, later on.  Those features will be
implemented in separate patchsets, but it's safe to check for all TIF_*
now because they can not be turned on anyway.

Patch [2/4] doesn't change any behavior but makes it easy and manageable to
confirm we invoke those hooks in correct order by splitting syscall_trace().

Patch [3/4] adds a commonly used function, which returns a return value of
system call.

Patch [4/4] removes is_compat_task from asm/compat.h to avoid conflicted
definitions.

Changes v4 -> v5:
* added the following patch from my seccomp patch since it is required for
  audit and ftrace in case of !COMPAT, too. [4/4]
  "arm64: is_compat_task is defined both in asm/compat.h and linux/compat.h"

Changes v3 -> v4:
* added "arm64: split syscall_trace() into separate functions for enter/
  exit", which is just a preparation for adding syscall trace hooks later.

Changes v2 -> v3:
* reverted a change in syscall_trace() in v1 [1/2]
* added "arm64: Add regs_return_value() in syscall.h" patch which was
  previously included in audit patch [2/2]

Changes v1 -> v2:
* added a guard against TIF_SYSCALL_TRACE at tracehook_report_syscall_*()
* renamed _TIF_WORK_SYSCALL to _TIF_SYSCALL_WORK

AKASHI Takahiro (4):
  arm64: make a single hook to syscall_trace() for all syscall features
  arm64: split syscall_trace() into separate functions for enter/exit
  arm64: Add regs_return_value() in syscall.h
  arm64: is_compat_task is defined both in asm/compat.h and
linux/compat.h

 arch/arm64/include/asm/compat.h  |  5 
 arch/arm64/include/asm/ptrace.h  |  5 
 arch/arm64/include/asm/thread_info.h | 13 +
 arch/arm64/kernel/entry.S| 15 +--
 arch/arm64/kernel/hw_breakpoint.c|  2 +-
 arch/arm64/kernel/process.c  |  1 +
 arch/arm64/kernel/ptrace.c   | 51 +---
 arch/arm64/kernel/signal.c   |  2 +-
 8 files changed, 58 insertions(+), 36 deletions(-)

-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 4/4] arm64: is_compat_task is defined both in asm/compat.h and linux/compat.h

2014-03-14 Thread AKASHI Takahiro

Some kernel files may include both linux/compat.h and asm/compat.h directly
or indirectly. Since both header files contain is_compat_task() under
!CONFIG_COMPAT, compiling them with !CONFIG_COMPAT will eventually fail.
Such files include kernel/auditsc.c, kernel/seccomp.c and init/do_mounts.c
(do_mounts.c may read asm/compat.h via asm/ftrace.h once ftrace is
implemented).

So this patch proactively
1) removes is_compat_task() under !CONFIG_COMPAT from asm/compat.h
2) replaces asm/compat.h with linux/compat.h in kernel/*.c,
   but asm/compat.h is still necessary in ptrace.c and process.c because
   they use is_compat_thread().

Signed-off-by: AKASHI Takahiro 
---
 arch/arm64/include/asm/compat.h   | 5 -
 arch/arm64/kernel/hw_breakpoint.c | 2 +-
 arch/arm64/kernel/process.c   | 1 +
 arch/arm64/kernel/ptrace.c| 1 +
 arch/arm64/kernel/signal.c| 2 +-
 5 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index fda2704..3b334f9 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -305,11 +305,6 @@ static inline int is_compat_thread(struct thread_info 
*thread)
 
 #else /* !CONFIG_COMPAT */
 
-static inline int is_compat_task(void)
-{
-   return 0;
-}
-
 static inline int is_compat_thread(struct thread_info *thread)
 {
return 0;
diff --git a/arch/arm64/kernel/hw_breakpoint.c 
b/arch/arm64/kernel/hw_breakpoint.c
index f17f581..a45e2db 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -20,6 +20,7 @@
 
 #define pr_fmt(fmt) "hw-breakpoint: " fmt
 
+#include 
 #include 
 #include 
 #include 
@@ -27,7 +28,6 @@
 #include 
 #include 
 
-#include 
 #include 
 #include 
 #include 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 1c0a9be..fc8a387 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -20,6 +20,7 @@
 
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index f606276..c47a3ed 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -19,6 +19,7 @@
  * along with this program.  If not, see .
  */
 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 890a591..4a09989 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -17,6 +17,7 @@
  * along with this program.  If not, see .
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -25,7 +26,6 @@
 #include 
 #include 
 
-#include 
 #include 
 #include 
 #include 
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: linux-next: build failure after merge of the driver-core tree

2014-03-14 Thread Greg KH

On Sat, Mar 15, 2014 at 01:57:29PM +1100, Benjamin Herrenschmidt wrote:
> On Sat, 2014-03-15 at 00:03 +, Greg KH wrote:
> > On Fri, Mar 14, 2014 at 09:14:55AM +1100, Benjamin Herrenschmidt wrote:
> > > On Thu, 2014-03-13 at 11:37 +1100, Benjamin Herrenschmidt wrote:
> > > > On Wed, 2014-03-12 at 16:21 -0400, Tejun Heo wrote:
> > > > > It's a series of rather complex patches.  I really don't think
> > > > > duplicating them is a good idea.  We can either resurrect the old API
> > > > > to kill it again or set up a merge branch which I don't think is too
> > > > > unusual in situations like this.
> > > > 
> > > > Right, a topic branch that gets merged in both driver-core-next and
> > > > powerpc-next.
> > > 
> > > Just want to make sure we agree ... ie, the offending commit is already
> > > in powerpc-next on my side and I can't really back it out (I could
> > > revert it though).
> > 
> > You can pull in driver-core-next into your tree if you want, it's not
> > going to be reverted, and will be sent to Linus for 3.15-rc1, so you can
> > base your work on it and fix up the api usage in your tree that way.
> 
> It's messy. Stephen really doesn't like if we pull each other trees like
> that unless they are topic branches. He also doesn't like when we keep
> pulling Linus in.

I only pull Linus in after a -rc in which I have merged patches with him
for that "topic".  Otherwise I end up with merge issues, and for testing
reasons, I want those fixes from Linus and from me, in order to keep
people from hitting the same already-fixed issues.

> For example I purposefully kept powerpc -next on top of rc2. You seem to
> regularly merge subsequent rc's into driver-core-next. So by pulling
> your tree I would bring a whole lot of stuff on top of mine, which is
> fine by git but makes histories more complicated and annoys Stephen.
> 
> I might still do it this time around, because the other solution for me
> is revert + re-apply with fixups on top of a separate branch itself
> derived from driver-core-next and send multiple pull requests to Linus,
> and that's messy too. The question is which one is more :-)

Just take my tree, it's not a big deal, I'll merge first with Linus if
you want and then everything is simple.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 1/2] Staging: comedi: convert while loops to timeouts in s626.c

2014-03-14 Thread gre...@linuxfoundation.org

On Fri, Mar 14, 2014 at 06:43:37PM -0700, Chase Southwood wrote:
> >On Tuesday, March 11, 2014 9:26 AM, Ian Abbott  wrote:
> 
> >>On 2014-03-09 04:00, Chase Southwood wrote:
> >> This patch changes a handful of while loops to timeouts to prevent
> >> infinite looping on hardware failure. A couple such loops are in a
> >> function (s626_debi_transfer()) which is called from critical sections,
> >> so comedi_timeout() is unusable for them, and an iterative timeout is
> >> used instead. For the while loops in a context where comedi_timeout() is
> >> allowed, a new callback function, s626_send_dac_eoc(), has been defined
> >> to evaluate the conditions that the while loops are testing.  The new
> >> callback employs a switch statement based on a simple new enum so that
> >> it is usable for all of the different conditions tested in while loops
> >> in s626_send_dac().  The proper comedi_timeout() calls are then used.
> >>
> >> Signed-off-by: Chase Southwood 
> >> ---
> >> Ian, here is a version of this patchset employing the enum you recommended.
> >> The second patch has been rebased on top of this one.
> >>
> >> 2: Used comedi_timeout() where appropriate, introduce callback function
> >>
> >> 3: Updated callback to switch on new enum.
> >
> >Reviewed-by: Ian Abbott 
> >
> >For future reference, for patches affecting a single comedi driver, we 
> >usually title the patches like this:
> >
> >staging: comedi: name_of_driver: summary of patch
> >
> 
> 
> Hi Greg!
> 
> I was just writing to inquire whether you were able to add this patch as well 
> as
> PATCH 2/2 Propagate timeout errors in s626.c, to your queue in their current 
> state.
> I had to resend this patch to you about a week ago because the subject line 
> got
> a little messed up, which might have led to a bit of confusion regarding the 
> 2
> patch series, and I wanted to check in to see whether you need me to do 
> anything
> further.

I've been on vacation this week and will dig through my huge patch queue
next week.  Then I will need another vacation...

Give me a chance to catch up, I'll let you know if I have problems with
them.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH RFC 0/9] socket filtering using nf_tables

2014-03-14 Thread Alexei Starovoitov

On Fri, Mar 14, 2014 at 11:16 AM, Pablo Neira Ayuso  wrote:
> On Fri, Mar 14, 2014 at 08:28:05AM -0700, Alexei Starovoitov wrote:
>> On Thu, Mar 13, 2014 at 5:29 AM, Pablo Neira Ayuso  
>> wrote:
>> > On Wed, Mar 12, 2014 at 08:29:07PM -0700, Alexei Starovoitov wrote:
>> >> On Wed, Mar 12, 2014 at 2:15 AM, Pablo Neira Ayuso  
>> >> wrote:
>> > [...]
>>
>> It seems you're assuming that ebpf inherited all the shortcomings
>> of bpf and drawing conclusions based on that. Not your fault.
>> I didn't explain it well enough.
>
> Frankly, from the *interface point of view* it is indeed inheriting
> all of its shortcomings...

Hi Pablo, David,

Let's go back to what ebpf is...
ebpf == generalization of assembler instructions across different architectures.

Take x86_64 instructions ld/st/alu/mov/cmp/call/jmp and
then rename them into my_ld, my_st, my_add, my_call, etc
Then do the same for arm64.
Also rename register names into r0,r1,r2
and remember how you did the mapping.
Also analyze x86_64, arm64 call convention, so that callee saved
registers are mapped to the same regs and arguments are passed
in r1, r2, ...

A function call in such assembler will look like:
my_mov r1, 1
my_mov r2, 2
my_call foo

that maps back to x86_64:
mov rdi, 1
mov rsi, 2
call foo

Since calling convention is compatible between 'renamed assembler'
and original x86_64 or arm assembler, the program written in 'renamed
assembler' can call native functions directly.
The opposite is also true.
Functions written in x86 assembler or C can call into functions
written in 'renamed' assembler.
Example:

f1.s:
 mov rdi, 1
 mov rsi, 2
 call f2
 ret

f2.s:
  my_mov r3, r1
  my_mov r1, r2
  my_mov r2, r3
  my_call f3
  my_ret

f3.s:
  mov rax, rdi
  sub  rax, rsi
  ret

fyi, in C these assembler blobs roughly do:
u64 f1() { return f2(1,2); }
u64 f2(u64 a, u64 b) { return f3(b, a); }
u64 f3(u64 a, u64 b) { return a - b; }

f1.s and f3.s are written in x86_64 and f2.s is written in 'renamed assembler'.

compile f1.s, f3.s into binary x86 code
compile f2.s into some binary code
(either fixed insn size or variable, that's irrelevant), let's call it format B

Now load all three binary blobs into kernel.
1st and 3rd blob can be loaded as-is.
2nd blob needs to be remapped from format B into x86_64 binary code.

After that CPU can call f1() and receive 1 back.

What programs can be written in x86_64 assembler? Anything.
What programs can be written in renamed assembler? Anything.

How often do we want to extend x86_64 assembler? Rarely.
Only when an algorithm implemented in pure x86_64 needs
mmx/sse acceleration.
Intel does not extend x86 to add a feature, but to accelerate a feature.
Same with 'renamed' assembler.
Any algorithm can be implemented using renamed assembler.

So what is ebpf? It's a format B. It can be fixed size or variable.
That is irrelevant. While loading, the program in format B is
reverse mapped into x86 binary code.

What programs can be written in format B? Anything.
Does format B need to be extended to support nft? no
to support socket filters? no
to support tracing filters? no
to support crazy idea that will come N years from now? no
Hard to believe? Think back that it is renamed x86 assembler.

Format B was chosen to look like bpf to make adoption easier
and to make conversion from bpf to ebpf trivial,
but looks like it was a bad idea.
I should have just called it 'simplified x86_64 insn set'.

Now about 'user interface point of view'...
old bpf, netlink, nft format are interfaces.
Until format B is exposed to user space it is not an interface.
nftables can use format B to jit the code.
nftables's user interface doesn't change.

In the patches I sent, ebpf is _not_ exposed to the user.
My patch set immediately helps performance of existing
socket filters and seccomp.
And can do jitting for nft.

Another way of thinking about ebpf:
ebpf is a different way of encoding x86 insns.

I also think we can expose ebpf to the user space,
but that's a different patch and a different discussion.

Thanks!

Hi Pablo,

now back to our discussion:

>> Technically ebpf is a small evolution of bpf, but applicability made a
>> giant leap. I cannot compile C into bpf, but I can do that with ebpf.
>
> Good that we can get better userspace tools, but still the layout of
> your instructions is exposed to userspace so it cannot be changed
> *ever*. The kernel interface is still an array of binary structures of
> fixed size just like BPF does.

that fixed size is irrelevant from extensibility point of view.
sparc has fixed size instructions too, but we don't change sparc
instruction width.
Let's say we decided to remap all sparc instructions and add new
pseudo instructions. These pseudo sparc insns won't buy us any
performance, because in the end they're remapped into real
instructions that cpu can execute.
These fake new pseudo sparc instructions won't give us
any new features either.

Format B should not be changed.
We can add new instructions if we

[PATCH] virtio-blk: make the queue depth the max supportable by the hypervisor

2014-03-14 Thread Theodore Ts'o

The current virtio block sets a queue depth of 64, which is
insufficient for very fast devices.  It has been demonstrated that
with a high IOPS device, using a queue depth of 256 can double the
IOPS which can be sustained.

As suggested by Venkatesh Srinivas, set the queue depth by default to
be one half the device's virtqueue, which is the maximum queue
depth that can be supported by the channel to the host OS (each I/O
request requires at least two VQ entries).

Also allow the queue depth to be something which can be set at module
load time or via a kernel boot-time parameter, for
testing/benchmarking purposes.

Signed-off-by: "Theodore Ts'o" 
Signed-off-by: Venkatesh Srinivas 
Cc: Rusty Russell 
Cc: "Michael S. Tsirkin" 
Cc: virtio-...@lists.oasis-open.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: Frank Swiderski 
---

This is a combination of my patch and Venkatesh's patch.  I agree that
setting the default automatically is better than requiring the user to
set the value by hand.

 drivers/block/virtio_blk.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6a680d4..0f70c01 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -481,6 +481,9 @@ static struct blk_mq_ops virtio_mq_ops = {
.free_hctx  = blk_mq_free_single_hw_queue,
 };
 
+static int queue_depth = -1;
+module_param(queue_depth, int, 0444);
+
 static struct blk_mq_reg virtio_mq_reg = {
.ops= _mq_ops,
.nr_hw_queues   = 1,
@@ -551,9 +554,14 @@ static int virtblk_probe(struct virtio_device *vdev)
goto out_free_vq;
}
 
+   virtio_mq_reg.queue_depth = queue_depth > 0 ? queue_depth :
+   (vblk->vq->num_free / 2);
virtio_mq_reg.cmd_size =
sizeof(struct virtblk_req) +
sizeof(struct scatterlist) * sg_elems;
+   virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
+   pr_info("%s: using queue depth %d\n", vblk->disk->disk_name,
+   virtio_mq_reg.queue_depth);
 
q = vblk->disk->queue = blk_mq_init_queue(_mq_reg, vblk);
if (!q) {
@@ -565,8 +573,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 
q->queuedata = vblk;
 
-   virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
-
vblk->disk->major = major;
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->private_data = vblk;
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] staging: comedi: fix memory leak

2014-03-14 Thread Chase Southwood

>On Friday, March 14, 2014 11:47 AM, Levente Kurusa  wrote:

>Call kfree() on bdev. The variable is otherwise leaked.
>
>Signed-off-by: Levente Kurusa 
>---
>drivers/staging/comedi/drivers/comedi_bond.c | 1 +
>1 file changed, 1 insertion(+)
>
>diff --git a/drivers/staging/comedi/drivers/comedi_bond.c 
>>b/drivers/staging/comedi/drivers/comedi_bond.c
>index 51a59e5..406aedb 100644
>--- a/drivers/staging/comedi/drivers/comedi_bond.c
>+++ b/drivers/staging/comedi/drivers/comedi_bond.c
>@@ -254,6 +254,7 @@ static int do_dev_config(struct comedi_device *dev, struct 
>comedi_devconfig *it)
>            if (!devs) {
>                dev_err(dev->class_dev,
>                    "Could not allocate memory. Out of memory?\n");
>+                kfree(bdev);
>                return -ENOMEM;
>            }
>            devpriv->devs = devs;
>-- 
>1.8.3.1
>


Levente,

This change has already been made in staging-next (by me, actually :) ).  In
order to avoid re-doing work which has already been done, please make sure to 
base
all of your patches off of linux-next (or for staging, staging-next).

Thanks,
Chase
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 1/4] pci: APM X-Gene PCIe controller driver

2014-03-14 Thread Tanmay Inamdar

Thanks for the review and comments. I will incorporate the comments
from you and Jingoo Han in next version.

-Tanmay

On Fri, Mar 14, 2014 at 5:18 AM, Arnd Bergmann  wrote:
> On Thursday 06 March 2014, Tanmay Inamdar wrote:
>
>> +static inline void xgene_pcie_cfg_out16(void __iomem *addr, u16 val)
>> +{
>> + u64 temp_addr = (u64)addr & ~0x3;
>
> Please use 'unsigned long' as the type for calculations like this one,
> to make the code more portable. You mentioned before that the same PCI
> host controller is used on some ppc4xx, and we may want to share the
> code later.
>
>> +static int xgene_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
>> +   int offset, int len, u32 *val)
>> +{
>> + struct xgene_pcie_port *port = bus->sysdata;
>> + void __iomem *addr;
>> + u8 val8;
>> + u16 val16;
>> +
>> + if ((pci_is_root_bus(bus) && devfn != 0) || !port->link_up)
>> + return PCIBIOS_DEVICE_NOT_FOUND;
>> +
>> + xgene_pcie_set_rtdid_reg(bus, devfn);
>> + addr = xgene_pcie_get_cfg_base(bus);
>> + switch (len) {
>> + case 1:
>> + xgene_pcie_cfg_in8(addr + offset, );
>> + *val = val8;
>> + break;
>
> Actually it would be better to just pass both addr and offset
> down into the low-level accessors and then do the calculation
> on 'offset', which is already a scalar.
>
>> +static void xgene_pcie_poll_linkup(struct xgene_pcie_port *port,
>> +u32 *lanes, u32 *speed)
>> +{
>> + void __iomem *csr_base = port->csr_base;
>> + u32 val32;
>> + u64 start_time, time;
>> +
>> + /*
>> +  * A component enters the LTSSM Detect state within
>> +  * 20ms of the end of fundamental core reset.
>> +  */
>> + msleep(XGENE_LTSSM_DETECT_WAIT);
>> + port->link_up = 0;
>> + start_time = jiffies;
>> + do {
>> + val32 = readl(csr_base + PCIECORE_CTLANDSTATUS);
>> + if (val32 & LINK_UP_MASK) {
>> + port->link_up = 1;
>> + *speed = PIPE_PHY_RATE_RD(val32);
>> + val32 = readl(csr_base + BRIDGE_STATUS_0);
>> + *lanes = val32 >> 26;
>> + break;
>> + }
>> + msleep(1);
>> + time = jiffies_to_msecs(jiffies - start_time);
>> + } while (time <= XGENE_LTSSM_L0_WAIT);
>> +}
>
> This can be written in a simpler way using 'time_before()'.
>
>> +/* Return 0 on success */
>> +static int xgene_pcie_init_ecc(struct xgene_pcie_port *port)
>> +{
>> + void __iomem *csr_base = port->csr_base;
>> + u64 start_time, time = 0;
>> + u32 val;
>> +
>> + val = readl(csr_base + MEM_RAM_SHUTDOWN);
>> + if (!val)
>> + return 0;
>> + writel(0x0, csr_base + MEM_RAM_SHUTDOWN);
>> + start_time = jiffies;
>> + do {
>> + val = readl(csr_base + BLOCK_MEM_RDY);
>> + if (val == BLOCK_MEM_RDY_VAL)
>> + break;
>> + msleep(1);
>> + time = jiffies_to_msecs(jiffies - start_time);
>> + } while (time < XGENE_PCIE_ECC_TIMEOUT);
>
> Same here.
>
>> +static int xgene_pcie_init_port(struct xgene_pcie_port *port)
>> +{
>> + int rc;
>> +
>> + port->clk = clk_get(port->dev, NULL);
>> + if (IS_ERR_OR_NULL(port->clk)) {
>> + dev_err(port->dev, "clock not available\n");
>> + return -ENODEV;
>> + }
>
> Practically every use of IS_ERR_OR_NULL() is a bug, don't do that.
> NULL is a valid return code from clk_get(), and should not be
> treated as an error.
>
>
>> +static int xgene_pcie_map_ranges(struct xgene_pcie_port *port,
>> +  struct pci_host_bridge *bridge,
>> +  u64 cfg_addr)
>> +{
>> + struct device *dev = port->dev;
>> + struct pci_host_bridge_window *window;
>> +
>> + list_for_each_entry(window, >windows, list) {
>> + struct resource *res = window->res;
>> + u64 restype = resource_type(res);
>> + dev_dbg(port->dev, "0x%08lx 0x%016llx...0x%016llx\n",
>> + res->flags, res->start, res->end);
>> +
>> + switch (restype) {
>> + case IORESOURCE_IO:
>> + xgene_pcie_setup_ob_reg(port, res, OMR2BARL,
>> + bridge->io_base);
>> + BUG_ON(pci_ioremap_io(res, bridge->io_base) < 0);
>> + break;
>
> No need to BUG_ON() here, this is not a fatal condition, just
> don't register the I/O space resource if this fails.
>
> I think as the PCI base support patch series evolves, you will actually
> not have to do this at all.
>
> Arnd
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at

Re: [PATCH v4 2/4] arm64: dts: APM X-Gene PCIe device tree nodes

2014-03-14 Thread Tanmay Inamdar

On Fri, Mar 14, 2014 at 5:07 AM, Arnd Bergmann  wrote:
> On Thursday 06 March 2014, Tanmay Inamdar wrote:
>> + pcie0: pcie@1f2b {
>> + status = "disabled";
>> + device_type = "pci";
>> + compatible = "apm,xgene-storm-pcie", "apm,xgene-pcie";
>> + #interrupt-cells = <1>;
>> + #size-cells = <2>;
>> + #address-cells = <3>;
>> + reg = < 0x00 0x1f2b 0x0 0x0001   /* Controller 
>> registers */
>> + 0xe0 0xd000 0x0 0x0020>; /* PCI config 
>> space */
>> + reg-names = "csr", "cfg";
>> + ranges = <0x0100 0x00 0x 0xe0 0x 
>> 0x00 0x0001   /* io */
>> +   0x0200 0x00 0x1000 0xe0 0x1000 
>> 0x00 0x8000>; /* mem */
>> + dma-ranges = <0x4200 0x40 0x 0x40 
>> 0x 0x40 0x>;
>> + interrupt-map-mask = <0x0 0x0 0x0 0x7>;
>> + interrupt-map = <0x0 0x0 0x0 0x1  0x0 0xc2 0x1
>> +  0x0 0x0 0x0 0x2  0x0 0xc3 0x1
>> +  0x0 0x0 0x0 0x3  0x0 0xc4 0x1
>> +  0x0 0x0 0x0 0x4  0x0 0xc5 0x1>;
>> + clocks = < 0>;
>> + };
>
> Is 0x40.0x the start of your RAM? I had expected RAM to start at 0.0,
> and in that case the dma-ranges property would be wrong.

RAM starting address is 0x40_.

>
> Arnd
> CONFIDENTIALITY NOTICE: This e-mail message, including any attachments,
> is for the sole use of the intended recipient(s) and contains information
> that is confidential and proprietary to Applied Micro Circuits Corporation or 
> its subsidiaries.
> It is to be used solely for the purpose of furthering the parties' business 
> relationship.
> All unauthorized review, use, disclosure or distribution is prohibited.
> If you are not the intended recipient, please contact the sender by reply 
> e-mail
> and destroy all copies of the original message.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/6] mm/memory-failure.c: report and recovery for memory error on dirty pagecache

2014-03-14 Thread Andi Kleen

On Thu, Mar 13, 2014 at 05:39:42PM -0400, Naoya Horiguchi wrote:
> Unifying error reporting between memory error and normal IO errors is ideal
> in a long run, but at first let's solve it separately. I hope that some code
> in this patch will be helpful when thinking of the unification.

The mechanisms should be very similar, right? 

It may be better to do both at the same time.

> index 60829565e552..1e8966919044 100644
> --- v3.14-rc6.orig/include/linux/fs.h
> +++ v3.14-rc6/include/linux/fs.h
> @@ -475,6 +475,9 @@ struct block_device {
>  #define PAGECACHE_TAG_DIRTY  0
>  #define PAGECACHE_TAG_WRITEBACK  1
>  #define PAGECACHE_TAG_TOWRITE2
> +#ifdef CONFIG_MEMORY_FAILURE
> +#define PAGECACHE_TAG_HWPOISON   3
> +#endif

No need to ifdef defines

> @@ -1133,6 +1139,10 @@ static void do_generic_file_read(struct file *filp, 
> loff_t *ppos,
>   if (unlikely(page == NULL))
>   goto no_cached_page;
>   }
> + if (unlikely(PageHWPoison(page))) {
> + error = -EHWPOISON;
> + goto readpage_error;
> + }

Didn't we need this check before independent of the rest of the patch?

>   if (PageReadahead(page)) {
>   page_cache_async_readahead(mapping,
>   ra, filp, page,
> @@ -2100,6 +2110,10 @@ inline int generic_write_checks(struct file *file, 
> loff_t *pos, size_t *count, i
>  if (unlikely(*pos < 0))
>  return -EINVAL;
>  
> + if (unlikely(mapping_hwpoisoned_range(file->f_mapping, *pos,
> +   *pos + *count)))
> + return -EHWPOISON;

How expensive is that check? This will happen on every write.
Can it be somehow combined with the normal page cache lookup?

>   * Dirty pagecache page
> + *
> + * Memory error reporting (important especially on dirty pagecache error
> + * because dirty data is lost) with AS_EIO flag has some problems:

It doesn't make sense to have changelogs in comments. That is what
git is for.  At some point no one will care about the previous code.

> + * To solve these, we handle dirty pagecache errors by replacing the error

This part of the comment is good.

> + pgoff_t index;
> + struct inode *inode = NULL;
> + struct page *new;
>  
>   SetPageError(p);
> - /* TBD: print more information about the file. */
>   if (mapping) {
> + index = page_index(p);
> + /*
> +  * we take inode refcount to keep it's pagecache or mapping
> +  * on the memory until the error is resolved.

How does that work? Who "resolves" the error? 

> +  */
> + inode = igrab(mapping->host);
> + pr_info("MCE %#lx: memory error on dirty pagecache (page 
> offset:%lu, inode:%lu, dev:%s)\n",

Add the word file somewhere, you need to explain this in terms normal
sysadmins and not only kernel hackers can understand.

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/4] perf kmem: introduce --list-cmds for use by scripts

2014-03-14 Thread Ramkumar Ramachandra

Signed-off-by: Ramkumar Ramachandra 
---
 tools/perf/builtin-kmem.c | 8 +---
 tools/perf/perf-completion.sh | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 929462a..bd91de0 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -756,11 +756,13 @@ int cmd_kmem(int argc, const char **argv, const char 
*prefix __maybe_unused)
OPT_BOOLEAN(0, "raw-ip", _ip, "show raw ip instead of symbol"),
OPT_END()
};
-   const char * const kmem_usage[] = {
-   "perf kmem [] {record|stat}",
+   const char *const kmem_subcommands[] = { "record", "stat", NULL };
+   const char *kmem_usage[] = {
+   NULL,
NULL
};
-   argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+   argc = parse_options_subcommand(argc, argv, kmem_options,
+   kmem_subcommands, kmem_usage, 0);
 
if (!argc)
usage_with_options(kmem_usage, kmem_options);
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
index ae3a576..0ef59dd 100644
--- a/tools/perf/perf-completion.sh
+++ b/tools/perf/perf-completion.sh
@@ -121,8 +121,8 @@ __perf_main ()
elif [[ $prev == "-e" && "${words[1]}" == @(record|stat|top) ]]; then
evts=$($cmd list --raw-dump)
__perfcomp_colon "$evts" "$cur"
-   # List subcommands for 'perf kvm'
-   elif [[ $prev == "kvm" ]]; then
+   # List subcommands for perf commands
+   elif [[ $prev == @(kvm|kmem) ]]; then
subcmds=$($cmd $prev --list-cmds)
__perfcomp_colon "$subcmds" "$cur"
# List long option names
-- 
1.9.rc0.1.g9d22d25

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/4] perf: trivial follow-ons to --list-cmds

2014-03-14 Thread Ramkumar Ramachandra

Hi,

With "perf kvm: introduce --list-cmds for use by scripts" accepted,
these are trivial follow-on patches to enable the same functionality
in kmem, mem, lock, and sched.

Thanks.

Ramkumar Ramachandra (4):
  perf kmem: introduce --list-cmds for use by scripts
  perf mem: introduce --list-cmds for use by scripts
  perf lock: introduce --list-cmds for use by scripts
  perf sched: introduce --list-cmds for use by scripts

 tools/perf/builtin-kmem.c |  8 +---
 tools/perf/builtin-lock.c | 10 ++
 tools/perf/builtin-mem.c  | 15 ---
 tools/perf/builtin-sched.c| 10 ++
 tools/perf/perf-completion.sh |  4 ++--
 5 files changed, 27 insertions(+), 20 deletions(-)

-- 
1.9.rc0.1.g9d22d25

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/4] perf sched: introduce --list-cmds for use by scripts

2014-03-14 Thread Ramkumar Ramachandra

Signed-off-by: Ramkumar Ramachandra 
---
 tools/perf/builtin-sched.c| 10 ++
 tools/perf/perf-completion.sh |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 6a76a07..347bd32 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1713,8 +1713,10 @@ int cmd_sched(int argc, const char **argv, const char 
*prefix __maybe_unused)
"perf sched replay []",
NULL
};
-   const char * const sched_usage[] = {
-   "perf sched [] {record|latency|map|replay|script}",
+   const char *const sched_subcommands[] = { "record", "latency", "map",
+ "replay", "script", NULL };
+   const char *sched_usage[] = {
+   NULL,
NULL
};
struct trace_sched_handler lat_ops  = {
@@ -1736,8 +1738,8 @@ int cmd_sched(int argc, const char **argv, const char 
*prefix __maybe_unused)
for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
sched.curr_pid[i] = -1;
 
-   argc = parse_options(argc, argv, sched_options, sched_usage,
-PARSE_OPT_STOP_AT_NON_OPTION);
+   argc = parse_options_subcommand(argc, argv, sched_options, 
sched_subcommands,
+   sched_usage, 
PARSE_OPT_STOP_AT_NON_OPTION);
if (!argc)
usage_with_options(sched_usage, sched_options);
 
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
index ecedab6..3356984 100644
--- a/tools/perf/perf-completion.sh
+++ b/tools/perf/perf-completion.sh
@@ -122,7 +122,7 @@ __perf_main ()
evts=$($cmd list --raw-dump)
__perfcomp_colon "$evts" "$cur"
# List subcommands for perf commands
-   elif [[ $prev == @(kvm|kmem|mem|lock) ]]; then
+   elif [[ $prev == @(kvm|kmem|mem|lock|sched) ]]; then
subcmds=$($cmd $prev --list-cmds)
__perfcomp_colon "$subcmds" "$cur"
# List long option names
-- 
1.9.rc0.1.g9d22d25

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/4] perf lock: introduce --list-cmds for use by scripts

2014-03-14 Thread Ramkumar Ramachandra

Signed-off-by: Ramkumar Ramachandra 
---
 tools/perf/builtin-lock.c | 10 ++
 tools/perf/perf-completion.sh |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index c852c7a..6148afc 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -961,8 +961,10 @@ int cmd_lock(int argc, const char **argv, const char 
*prefix __maybe_unused)
"perf lock info []",
NULL
};
-   const char * const lock_usage[] = {
-   "perf lock [] {record|report|script|info}",
+   const char *const lock_subcommands[] = { "record", "report", "script",
+"info", NULL };
+   const char *lock_usage[] = {
+   NULL,
NULL
};
const char * const report_usage[] = {
@@ -976,8 +978,8 @@ int cmd_lock(int argc, const char **argv, const char 
*prefix __maybe_unused)
for (i = 0; i < LOCKHASH_SIZE; i++)
INIT_LIST_HEAD(lockhash_table + i);
 
-   argc = parse_options(argc, argv, lock_options, lock_usage,
-PARSE_OPT_STOP_AT_NON_OPTION);
+   argc = parse_options_subcommand(argc, argv, lock_options, 
lock_subcommands,
+   lock_usage, 
PARSE_OPT_STOP_AT_NON_OPTION);
if (!argc)
usage_with_options(lock_usage, lock_options);
 
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
index f44c04b..ecedab6 100644
--- a/tools/perf/perf-completion.sh
+++ b/tools/perf/perf-completion.sh
@@ -122,7 +122,7 @@ __perf_main ()
evts=$($cmd list --raw-dump)
__perfcomp_colon "$evts" "$cur"
# List subcommands for perf commands
-   elif [[ $prev == @(kvm|kmem|mem) ]]; then
+   elif [[ $prev == @(kvm|kmem|mem|lock) ]]; then
subcmds=$($cmd $prev --list-cmds)
__perfcomp_colon "$subcmds" "$cur"
# List long option names
-- 
1.9.rc0.1.g9d22d25

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/4] perf mem: introduce --list-cmds for use by scripts

2014-03-14 Thread Ramkumar Ramachandra

Signed-off-by: Ramkumar Ramachandra 
---
 tools/perf/builtin-mem.c  | 15 ---
 tools/perf/perf-completion.sh |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 2e3ade69..4a1a6c9 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -21,11 +21,6 @@ struct perf_mem {
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 };
 
-static const char * const mem_usage[] = {
-   "perf mem [] {record  |report}",
-   NULL
-};
-
 static int __cmd_record(int argc, const char **argv)
 {
int rec_argc, i = 0, j;
@@ -220,9 +215,15 @@ int cmd_mem(int argc, const char **argv, const char 
*prefix __maybe_unused)
   " between columns '.' is reserved."),
OPT_END()
};
+   const char *const mem_subcommands[] = { "record", "report", NULL };
+   const char *mem_usage[] = {
+   NULL,
+   NULL
+   };
+
 
-   argc = parse_options(argc, argv, mem_options, mem_usage,
-PARSE_OPT_STOP_AT_NON_OPTION);
+   argc = parse_options_subcommand(argc, argv, mem_options, 
mem_subcommands,
+   mem_usage, 
PARSE_OPT_STOP_AT_NON_OPTION);
 
if (!argc || !(strncmp(argv[0], "rec", 3) || mem_operation))
usage_with_options(mem_usage, mem_options);
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
index 0ef59dd..f44c04b 100644
--- a/tools/perf/perf-completion.sh
+++ b/tools/perf/perf-completion.sh
@@ -122,7 +122,7 @@ __perf_main ()
evts=$($cmd list --raw-dump)
__perfcomp_colon "$evts" "$cur"
# List subcommands for perf commands
-   elif [[ $prev == @(kvm|kmem) ]]; then
+   elif [[ $prev == @(kvm|kmem|mem) ]]; then
subcmds=$($cmd $prev --list-cmds)
__perfcomp_colon "$subcmds" "$cur"
# List long option names
-- 
1.9.rc0.1.g9d22d25

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: numa: Recheck for transhuge pages under lock during protection changes

2014-03-14 Thread Sasha Levin


On 03/12/2014 06:36 AM, Mel Gorman wrote:

Andrew, this should go with the patches
mmnuma-reorganize-change_pmd_range.patch
mmnuma-reorganize-change_pmd_range-fix.patch
move-mmu-notifier-call-from-change_protection-to-change_pmd_range.patch
in mmotm please.

Thanks.

---8<---
From: Mel Gorman
Subject: [PATCH] mm: numa: Recheck for transhuge pages under lock during 
protection changes

Sasha Levin reported the following bug using trinity


I'm seeing a different issue with this patch. A NULL ptr deref occurs in the
pte_offset_map_lock() macro right before the new recheck code:

[ 1877.093980] BUG: unable to handle kernel NULL pointer dereference at 
0018
[ 1877.095174] IP: __lock_acquire+0xbc/0x5a0 (kernel/locking/lockdep.c:3069)
[ 1877.096069] PGD 6dee7a067 PUD 6dee7b067 PMD 0
[ 1877.096821] Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 1877.097706] Dumping ftrace buffer:
[ 1877.098281](ftrace buffer empty)
[ 1877.098825] Modules linked in:
[ 1877.099327] CPU: 19 PID: 27913 Comm: trinity-c100 Tainted: GW 
3.14.0-rc6-next-20140314-sasha-00012-g5590866 #219
[ 1877.100044] task: 8808f428 ti: 8806e1e54000 task.ti: 
8806e1e54000
[ 1877.100044] RIP:  __lock_acquire+0xbc/0x5a0 (kernel/locking/lockdep.c:3069)
[ 1877.100044] RSP: :8806e1e55be8  EFLAGS: 00010002
[ 1877.100044] RAX: 0082 RBX: 0018 RCX: 
[ 1877.100044] RDX:  RSI:  RDI: 0018
[ 1877.100044] RBP: 8806e1e55c58 R08: 0001 R09: 
[ 1877.100044] R10: 0001 R11: 0001 R12: 8808f428
[ 1877.100044] R13:  R14:  R15: 0001
[ 1877.100044] FS:  7fe3fe152700() GS:88042ba0() 
knlGS:
[ 1877.100044] CS:  0010 DS:  ES:  CR0: 8005003b
[ 1877.100044] CR2: 0018 CR3: 0006dee79000 CR4: 06a0
[ 1877.100044] DR0: 00698000 DR1: 00698000 DR2: 00698000
[ 1877.100044] DR3:  DR6: 0ff0 DR7: 0009060a
[ 1877.100044] Stack:
[ 1877.100044]  8806e1e55c18 81184e95 8808f4280038 
001d8500
[ 1877.100044]  88042bbd8500 0013 8806e1e55c48 
81185108
[ 1877.100044]  87c13bd0 8808f428  
0001
[ 1877.100044] Call Trace:
[ 1877.100044]  ? sched_clock_local+0x25/0x90 (kernel/sched/clock.c:205)
[ 1877.100044]  ? sched_clock_cpu+0xb8/0x100 (kernel/sched/clock.c:310)
[ 1877.100044]  lock_acquire+0x182/0x1d0 (arch/x86/include/asm/current.h:14 
kernel/locking/lockdep.c:3602)
[ 1877.100044]  ? change_pte_range+0xa3/0x410 (mm/mprotect.c:55)
[ 1877.100044]  ? __lock_release+0x1e2/0x200 (kernel/locking/lockdep.c:3506)
[ 1877.100044]  _raw_spin_lock+0x40/0x80 (include/linux/spinlock_api_smp.h:143 
kernel/locking/spinlock.c:151)
[ 1877.100044]  ? change_pte_range+0xa3/0x410 (mm/mprotect.c:55)
[ 1877.100044]  ? _raw_spin_unlock+0x35/0x60 (arch/x86/include/asm/preempt.h:98 
include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:183)
[ 1877.100044]  change_pte_range+0xa3/0x410 (mm/mprotect.c:55)
[ 1877.100044]  change_protection_range+0x3a8/0x4d0 (mm/mprotect.c:164 
mm/mprotect.c:188 mm/mprotect.c:213)
[ 1877.100044]  ? preempt_count_sub+0xe2/0x120 (kernel/sched/core.c:2529)
[ 1877.100044]  change_protection+0x25/0x30 (mm/mprotect.c:237)
[ 1877.100044]  change_prot_numa+0x1b/0x30 (mm/mempolicy.c:559)
[ 1877.100044]  task_numa_work+0x279/0x360 (kernel/sched/fair.c:1911)
[ 1877.100044]  task_work_run+0xae/0xf0 (kernel/task_work.c:125)
[ 1877.100044]  do_notify_resume+0x8e/0xe0 (include/linux/tracehook.h:196 
arch/x86/kernel/signal.c:751)
[ 1877.100044]  retint_signal+0x4d/0x92 (arch/x86/kernel/entry_64.S:1096)
[ 1877.100044] Code: c2 6f 3b 6d 85 be fa 0b 00 00 48 c7 c7 ce 94 6d 85 e8 f9 78 f9 
ff 31 c0 e9 bc 04 00 00 66 90 44 8b 1d 29 69 cd 04 45 85 db 74 0c <48> 81 3b 80 
f2 75 87 75 06 0f 1f 00 45 31 c0 83 fe 01 77 0c 89
[ 1877.100044] RIP  __lock_acquire+0xbc/0x5a0 (kernel/locking/lockdep.c:3069)
[ 1877.100044]  RSP 
[ 1877.100044] CR2: 0018


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mm: munlock: fix a bug where THP tail page is encountered

2014-03-14 Thread Sasha Levin


On 03/14/2014 07:55 PM, Sasha Levin wrote:

On 12/17/2013 08:00 AM, Vlastimil Babka wrote:

From: Vlastimil Babka
Date: Fri, 13 Dec 2013 14:25:21 +0100
Subject: [PATCH 1/3] mm: munlock: fix a bug where THP tail page is encountered

Since commit ff6a6da60 ("mm: accelerate munlock() treatment of THP pages")
munlock skips tail pages of a munlocked THP page. However, when the head page
already has PageMlocked unset, it will not skip the tail pages.

Commit 7225522bb ("mm: munlock: batch non-THP page isolation and
munlock+putback using pagevec") has added a PageTransHuge() check which
contains VM_BUG_ON(PageTail(page)). Sasha Levin found this triggered using
trinity, on the first tail page of a THP page without PageMlocked flag.

This patch fixes the issue by skipping tail pages also in the case when
PageMlocked flag is unset. There is still a possibility of race with THP page
split between clearing PageMlocked and determining how many pages to skip.
The race might result in former tail pages not being skipped, which is however
no longer a bug, as during the skip the PageTail flags are cleared.

However this race also affects correctness of NR_MLOCK accounting, which is to
be fixed in a separate patch.


I've hit the same thing again, on the latest -next, this time with a different 
trace:

[  539.199120] page:ea0013249a80 count:0 mapcount:1 mapping:  
(null) index:0x0
[  539.200429] page flags: 0x12f80008000(tail)
[  539.201167] [ cut here ]
[  539.201889] kernel BUG at include/linux/page-flags.h:415!
[  539.202859] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[  539.204588] Dumping ftrace buffer:
[  539.206415](ftrace buffer empty)
[  539.207022] Modules linked in:
[  539.207503] CPU: 3 PID: 18262 Comm: trinity-c228 Tainted: GW 
3.14.0-rc6-next-20140313-sasha-00010-gb8c1db1-dirty #217
[  539.209012] task: 880627b1 ti: 8805a44c2000 task.ti: 
8805a44c2000
[  539.209989] RIP:  munlock_vma_pages_range+0x93/0x1d0 
(include/linux/page-flags.h:415 mm/mlock.c:494)
[  539.210263] RSP: :8805a44c3e08  EFLAGS: 00010246
[  539.210263] RAX: 88052ae126a0 RBX: 0006a000 RCX: 0099
[  539.210263] RDX:  RSI: 880627b10cf0 RDI: 04c926a0
[  539.210263] RBP: 8805a44c3ec8 R08: 0001 R09: 0001
[  539.210263] R10: 0001 R11: 0001 R12: ea0013249a80
[  539.210263] R13: 88039dc95a00 R14: 0006b000 R15: 8805a44c3e94
[  539.210263] FS:  7fd6ce14a700() GS:88042b80() 
knlGS:
[  539.210263] CS:  0010 DS:  ES:  CR0: 8005003b
[  539.210263] CR2: 7fd6ce0ef6ac CR3: 0006025cd000 CR4: 06a0
[  539.210263] DR0: 00698000 DR1:  DR2: 
[  539.210263] DR3:  DR6: 0ff0 DR7: 0600
[  539.210263] Stack:
[  539.210263]    00018805a44c3e38 

[  539.210263]   88039dc95a00 a44c3e88 

[  539.210263]  00ff8805a44c3e58 880528f0a0f0 8805a44c3eb8 
88039dc95a00
[  539.210263] Call Trace:
[  539.210263]  do_munmap+0x1d2/0x360 (mm/internal.h:168 mm/mmap.c:2547)
[  539.210263]  ? down_write+0xa6/0xc0 (kernel/locking/rwsem.c:51)
[  539.210263]  ? vm_munmap+0x46/0x80 (mm/mmap.c:2571)
[  539.210263]  vm_munmap+0x54/0x80 (mm/mmap.c:2572)
[  539.210263]  SyS_munmap+0x2c/0x40 (mm/mmap.c:2577)
[  539.210263]  tracesys+0xdd/0xe2 (arch/x86/kernel/entry_64.S:749)
[  539.210263] Code: ff 49 89 c4 48 85 c0 0f 84 f3 00 00 00 48 3d 00 f0 ff ff 0f 87 
e7 00 00 00 48 8b 00 66 85 c0 79 17 31 f6 4c 89 e7 e8 4d d2 fc ff <0f> 0b 0f 1f 
00 eb fe 66 0f 1f 44 00 00 49 8b 04 24 f6 c4 40 74
[  539.210263] RIP  munlock_vma_pages_range+0x93/0x1d0 
(include/linux/page-flags.h:415 mm/mlock.c:494)
[  539.210263]  RSP 
[  539.23] ---[ end trace 4e90dc9141579181 ]---


Thanks,
Sasha


And another related trace:

[  741.192502] kernel BUG at mm/mlock.c:528!
[  741.193088] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[  741.194177] Dumping ftrace buffer:
[  741.194645](ftrace buffer empty)
[  741.195109] Modules linked in:
[  741.195728] CPU: 23 PID: 19908 Comm: trinity-c264 Tainted: G    W 
3.14.0-rc6-next-20140314-sasha-00012-g5590866 #219
[  741.197549] task: 88061fc2b000 ti: 8805decb8000 task.ti: 
8805decb8000
[  741.198548] RIP:  munlock_vma_pages_range+0x176/0x1d0 (mm/mlock.c:528)
[  741.199754] RSP: 0018:8805decb9e08  EFLAGS: 00010206
[  741.200085] RAX: 01ff RBX: 00111000 RCX: 
[  741.200085] RDX: 0111 RSI: 81295fdd RDI: 84490705
[  741.200085] RBP: 8805decb9ec8 R08:  R09: 
[  741.200085] R10: 0001 R11:  R12: fff2
[  741.200085]

Re: linux-next: build failure after merge of the driver-core tree

2014-03-14 Thread Benjamin Herrenschmidt

On Sat, 2014-03-15 at 00:03 +, Greg KH wrote:
> On Fri, Mar 14, 2014 at 09:14:55AM +1100, Benjamin Herrenschmidt wrote:
> > On Thu, 2014-03-13 at 11:37 +1100, Benjamin Herrenschmidt wrote:
> > > On Wed, 2014-03-12 at 16:21 -0400, Tejun Heo wrote:
> > > > It's a series of rather complex patches.  I really don't think
> > > > duplicating them is a good idea.  We can either resurrect the old API
> > > > to kill it again or set up a merge branch which I don't think is too
> > > > unusual in situations like this.
> > > 
> > > Right, a topic branch that gets merged in both driver-core-next and
> > > powerpc-next.
> > 
> > Just want to make sure we agree ... ie, the offending commit is already
> > in powerpc-next on my side and I can't really back it out (I could
> > revert it though).
> 
> You can pull in driver-core-next into your tree if you want, it's not
> going to be reverted, and will be sent to Linus for 3.15-rc1, so you can
> base your work on it and fix up the api usage in your tree that way.

It's messy. Stephen really doesn't like if we pull each other trees like
that unless they are topic branches. He also doesn't like when we keep
pulling Linus in.

For example I purposefully kept powerpc -next on top of rc2. You seem to
regularly merge subsequent rc's into driver-core-next. So by pulling
your tree I would bring a whole lot of stuff on top of mine, which is
fine by git but makes histories more complicated and annoys Stephen.

I might still do it this time around, because the other solution for me
is revert + re-apply with fixups on top of a separate branch itself
derived from driver-core-next and send multiple pull requests to Linus,
and that's messy too. The question is which one is more :-)

Cheers,
Ben.

> thanks,
> 
> greg k-h
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[no subject]

2014-03-14 Thread Christian Organization


Good day

We are Christian Organization, we give out loan to  those who are
dedicated christians contact us via email, marieloanlend...@gmail.com

Regard

Mrs Marie



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: pull request: wireless 2014-03-14

2014-03-14 Thread David Miller

From: "John W. Linville" 
Date: Fri, 14 Mar 2014 14:20:26 -0400

> Please pull these last(?) few wireless bits intended for the 3.14
> stream.  Each is here to address a problem found with a patch already
> merged...
> 
> Dave Jones gives us a memory leak fix, for an error path in brcmfmac.
> 
> Felix Fietkau moves a small delay to make it actually reachable.
> 
> Helmut Schaa fixes an ath9k sequence numbering problem for non-data
> frames.
> 
> Stanislaw Gruszka reverts an earlier fix that was found to cause
> random connection drops on RT5390 PCI adapters
> 
> Please let me know if there are problems!  I'll be back from watching
> the Veronica Mars movie by about 9pm... :-)

:-)

Pulled, thanks John.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] net: phy: fix uninitalized ethtool_wolinfo in phy_suspend

2014-03-14 Thread David Miller

From: Sebastian Hesselbarth 
Date: Fri, 14 Mar 2014 10:07:44 +0100

> Callers of phy_ethtool_get_wol are supposed to provide a properly
> cleared struct ethtool_wolinfo. Therefore, fix phy_suspend to clear
> it before passing it to phy_ethtool_get_wol.
> 
> Signed-off-by: Sebastian Hesselbarth 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH] MAINTAINERS: Add linux.n...@intel.com to INTEL ETHERNET DRIVERS

2014-03-14 Thread David Miller

From: Joe Perches 
Date: Thu, 13 Mar 2014 10:11:45 -0700

> If this is added to the driver files, then maybe it's
> appropriate to add to MAINTAINERS as well.
> 
> Signed-off-by: Joe Perches 

Applied, thanks Joe.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 01/10] perf, tools: Add jsmn `jasmine' JSON parser

2014-03-14 Thread Andi Kleen

On Fri, Mar 14, 2014 at 04:41:31PM -0600, David Ahern wrote:
> On 3/14/14, 3:31 PM, Andi Kleen wrote:
> >@@ -374,6 +376,8 @@ LIB_OBJS += $(OUTPUT)util/stat.o
> >  LIB_OBJS += $(OUTPUT)util/record.o
> >  LIB_OBJS += $(OUTPUT)util/srcline.o
> >  LIB_OBJS += $(OUTPUT)util/data.o
> >+LIB_OBJS += $(OUTPUT)util/jsmn.o
> >+LIB_OBJS += $(OUTPUT)util/json.o
> >
> >  LIB_OBJS += $(OUTPUT)ui/setup.o
> >  LIB_OBJS += $(OUTPUT)ui/helpline.o
> >
> 
> CONFIG driven? Allow a user to omit this.

Why? It has no external dependencies. AFAIK that's the only reason for configs.

(unless you count the commands used by the shell script, but the basic parser
works fine even without the script)

-Andi
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next 0/2] bonding: use correct ether type for alb

2014-03-14 Thread David Miller

From: Veaceslav Falico 
Date: Thu, 13 Mar 2014 12:41:56 +0100

> There have been reports that, while using the ETH_P_LOOP ether type
> (0x0060), the ether type is treated as its packet length.
> 
> To avoid that and to not break already existing apps - add new ether type
> ETH_P_LOOPBACK that contains the correct id - 0x9000.

Series applied, thanks Veaceslav.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 3.14-rc: /proc/acpi/battery gone?

2014-03-14 Thread Stefan Lippers-Hollmann

Hi

On Saturday 15 March 2014, Rafael J. Wysocki wrote:
> On Friday, March 14, 2014 06:14:12 PM Ilia Mirkin wrote:
> > On Fri, Mar 14, 2014 at 6:11 PM, Pavel Machek  wrote:
> > > On Fri 2014-03-14 17:29:41, Ilia Mirkin wrote:
> > >> On Fri, Mar 14, 2014 at 5:14 PM, Pavel Machek  wrote:
[...]
> > wmbattery
> > 
> > They have attempted to use the sysfs api, but apparently that
> > integration was done with an older version of that API. There's also
> > some attempt to get it to work with upower, but I couldn't figure out
> > how to make that work either on my (up-to-date gentoo) box. (TBH I
> > didn't spend more than an hour or two on it, so it may not be
> > impossible.)
> 
> Tianyu, can you please have a look at this?

Disclaimer, I've never used wmbattery so far.

The current upstream version (2.42, released in early december 2013) 
of wmbattery[1] no longer reads from /proc/acpi/ at all. Apparently
it changed to using upower by default, with non-default fall-backs for
reading from sysfs. 

The only change required for building upower with wmbattery 2.42 
appears to be a new build-dependency on libupower-glib-dev (at least 
on Debian, built from the upower source package). Given that this 
version is present in Debian testing and unstable, I'd assume that it's
supposed to work using upower, although I haven't confirmed that myself.

Judging from the Gentoo ebuild, you probably just have to add 
"sys-power/upower" to the RDEPEND variable and make sure to build
wmbattery 2.42; this is untested.

Regards
Stefan Lippers-Hollmann

[1] Homepage: http://kitenet.net/~joey/code/wmbattery/
Vcs-Git: git://git.kitenet.net/wmbattery
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] bridge: fix bridge root block on designated port

2014-03-14 Thread Luis R. Rodriguez

On Thu, Mar 13, 2014 at 03:16:23PM -0700, Stephen Hemminger wrote:
> On Wed, 12 Mar 2014 20:15:27 -0700
> "Luis R. Rodriguez"  wrote:
> 
> > --- a/net/bridge/br_private.h
> > +++ b/net/bridge/br_private.h
> > @@ -150,6 +150,7 @@ struct net_bridge_port
> > u8  priority;
> > u8  state;
> > u16 port_no;
> > +   boolroot_block_enabled;
> > unsigned char   topology_change_ack;
> 
> It seems a bit confusing to have both a ROOT_BLOCK flag in the
> data structure and an additional root_block_enabled flag.
> If nothing else it is a waste of space.

Indeed, however there is a use for it. Consider the case where we loop
over each port and check to see if it's root blocked and we need to tickle it
or the bridge. In the case that root port block was enabled before and
someone is lifting it, the flag would be removed and therefore not on,
but the port was root blocked and we need a way to keep track of that.

The flag then is a toggle for userspace, while the bool tells us about
the current state.

> Looks like you are changing the meaning slightly. 

Let me know in what way. I can't see it.

> Is it possible to have BR_ROOT_BLOCK set but !root_block_enabled?

Yeah in the case a new request to set it to root block then
BR_ROOT_BLOCK would be set but root_block_enabled would not be set.

> and what about the inverse?

BR_ROOT_BLOCK would not be set when userspace wants to disable root
port block and root_block_enabled would be enabled in this case if
it used to be enabled. So yes, both are possible.

  Luis

pgpjXHByuXmjN.pgp
Description: PGP signature

Re: [PATCH v2 1/2] cpufreq: Add exit_prepare callback to cpufreq_driver interface

2014-03-14 Thread Rafael J. Wysocki

On Friday, March 14, 2014 02:03:56 PM dirk.brande...@gmail.com wrote:
> From: Dirk Brandewie 
> 
> This callback allows the driver to do clean up before the CPU is
> completely down and its state cannot be modified.  This is used
> by the intel_pstate driver to reduce the requested P state prior to
> the core going away.  This is required because the requested P state
> of the offline core is used to select the package P state. This
> effectively sets the floor package P state to the requested P state on
> the offline core.
> 
> Signed-off-by: Dirk Brandewie 
> ---
>  Documentation/cpu-freq/cpu-drivers.txt | 8 +++-
>  drivers/cpufreq/cpufreq.c  | 3 +++
>  include/linux/cpufreq.h| 1 +
>  3 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/cpu-freq/cpu-drivers.txt 
> b/Documentation/cpu-freq/cpu-drivers.txt
> index 8b1a445..79def80 100644
> --- a/Documentation/cpu-freq/cpu-drivers.txt
> +++ b/Documentation/cpu-freq/cpu-drivers.txt
> @@ -61,7 +61,13 @@ target_index   -   See below on the 
> differences.
>  
>  And optionally
>  
> -cpufreq_driver.exit -A pointer to a per-CPU cleanup function.
> +cpufreq_driver.exit -A pointer to a per-CPU cleanup
> + function called during CPU_POST_DEAD
> + phase of cpu hotplug process.
> +
> +cpufreq_driver.stop -A pointer to a per-CPU stop function
> + called during CPU_DOWN_PREPARE phase of
> + cpu hotplug process.
>  
>  cpufreq_driver.resume -  A pointer to a per-CPU resume function
>   which is called with interrupts disabled
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index cf485d9..0d430d7 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -1338,6 +1338,9 @@ static int __cpufreq_remove_dev_prepare(struct device 
> *dev,
>   }
>   }
>  
> + if (cpufreq_driver->stop)

What about doing

+   if (cpufreq_driver->setpolicy && cpufreq_driver->stop)

here instead?  That would make it clear where the new callback belongs.

If you're fine with that, I can make that change when applying the patch.

> + cpufreq_driver->stop(policy);
> +
>   return 0;
>  }
>  
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 4d89e0e..ff8db19 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -224,6 +224,7 @@ struct cpufreq_driver {
>   int (*bios_limit)   (int cpu, unsigned int *limit);
>  
>   int (*exit) (struct cpufreq_policy *policy);
> + int (*stop) (struct cpufreq_policy *policy);
>   int (*suspend)  (struct cpufreq_policy *policy);
>   int (*resume)   (struct cpufreq_policy *policy);
>   struct freq_attr**attr;
> 

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/2] Add exit_prepare callback to the cpufreq_driver interface.

2014-03-14 Thread Rafael J. Wysocki

On Friday, March 14, 2014 11:29:04 AM Dirk Brandewie wrote:
> On 03/14/2014 10:07 AM, Viresh Kumar wrote:
> > On 14 March 2014 20:40, Dirk Brandewie  wrote:
> >> Are you proposing adding cpufreq_generic_suspend() to the core I can not
> >> find
> >> it in the mainline code.
> >
> > It's already there in linux-next. I am suggesting to reuse that
> > infrastructure with
> > some necessary modification to support both suspend and hotplug.
> 
> Suspend and hotplug are two very different things and if we start
> crossing those wires bad things are going to happen IMHO.
> 
> In "normal" operation using the suspend path to do this work could
> work in principle but doesn't handle the case where the user does
> echo 0 | sudo tee /sys/devices/system/cpu/cpuX/online
> 
> Trying to force hotplug and suspend into a common mechanism would
> lead to a bunch of special case code or a significant rework of the
> core code IMHO.
> 
> 
> >
> >>> Over that I don't think Dirk's solution is going to work if we twist
> >>> the systems a bit.
> >>
> >> Could you explain what "twist the systems a bit" means?
> >
> > The one I explained in the below paragraph.
> >
> >>> For example, Dirk probably wanted to set P-STATE of every core to MIN
> >>> when it goes down. But his solution probably doesn't do that right now.
> >>>
> >>
> >> No, I wanted to set the core that was being off-lined to min P state.
> >
> > Sorry, probably my words were a bit confusing. I meant exactly what
> > you just wrote. Core going down will set its freq to min.
> >
> >>> As exit() is called only when the policy expires or all the CPUs of that
> >>> policy
> >>> are down. Suppose only one CPU out of 4 goes down from a policy, then
> >>> pstate driver would never know that happened. And that core wouldn't go
> >>> to min state.
> >>
> >> My patch does not change the semantics of exit() or when it is called.  For
> >> intel_pstate there is one cpu per policy so I moved all the cleanup to
> >
> > I didn't know that it's guaranteed by pstate driver. I thought it would
> > still be
> > hardware dependent as some cores might share clock line.
> 
> This is guaranteed by the hardware.  Each core has its own MSR for P state
> request.  Any coordination that is required between cores to select the
> package P state is handled by the hardware.
> 
> >
> >> exit_prepare() if there were work I needed to do at CPU_POST_DEAD I would
> >> have
> >> continued to export the *optional* exit callback.
> >>
> >> The callback name exit_prepare() was probably unfortunate and might be
> >> causing
> >> some confusion.  I will be changing the name per Rafael's feedback.
> >
> > Don't know.. There is another problem here that exit_prepare() would be 
> > called
> > for each CPU whereas exit() would be called for each policy.
> 
> Granted but I don't see this as a problem in this case there is a 1:1
> relationship.  If a driver chooses to use the *optional* exit_prepare() 
> callback
> and knows that there is a many:1 relationship between the policy and CPUs
> then it would have to deal with it.

Actually, I think we should make it clear that the new callback is for
->setpolicy drivers only, which will make things a bit clearer.

We seem to get caught by the difference between ->setpolicy and ->target
drivers on a regular basis, so it might be a good idea to make the distinction
more clear in the code.  I have an idea how to do that, but need some time
to prototype it.

> > And I strongly feel that we shouldn't give another callback here but instead
> > just set core to a specific freq as mentioned by driver with some other 
> > field.
> >
> >>> I think we have two solutions here:
> >>> - If its just about setting core a particular freq when it goes down, I
> >>> think it
> >>> looks a generic enough problem and so better fix core for that. Probably
> >>> with
> >>> help of flags field/suspend-freq (maybe renamed) and without calling
> >>> drivers
> >>> exit() at all..
> >>
> >>
> >> ATM the only thing that needs to be done in this case is to allow
> >> intel_pstate
> >> to set the P state on the core when it is going down.  My solution from the
> >> cores point of view is more generic, it allows any driver that needs to do
> >> work
> >> during CPU_DOWN_PREPARE to do it without adding any new logic to the core.
> >
> > Yeah, do we really need to give that freedom right now? Would be better
> > to get this into core as that would be more generic and people looking to 
> > set
> > core to some freq at shutdown wouldn't be replicating that code.

Question is if it needs to be more generic.

I honestly don't think that ->target drivers will ever do anything like it,
because they need the governor to "exit" before.  So we are talking about the
only two ->setpolicy drivers in the tree here.

> IMHO yes and it would be hard to be more generic, if your platform needs to
> do architecture specific during the PREPARE phase of cpu hotplug use this
> callback or not.
> 
> BTW now that you have

Re: [PATCH v3 1/2] Staging: comedi: convert while loops to timeouts in s626.c

2014-03-14 Thread Chase Southwood

>On Tuesday, March 11, 2014 9:26 AM, Ian Abbott  wrote:

>>On 2014-03-09 04:00, Chase Southwood wrote:
>> This patch changes a handful of while loops to timeouts to prevent
>> infinite looping on hardware failure. A couple such loops are in a
>> function (s626_debi_transfer()) which is called from critical sections,
>> so comedi_timeout() is unusable for them, and an iterative timeout is
>> used instead. For the while loops in a context where comedi_timeout() is
>> allowed, a new callback function, s626_send_dac_eoc(), has been defined
>> to evaluate the conditions that the while loops are testing.  The new
>> callback employs a switch statement based on a simple new enum so that
>> it is usable for all of the different conditions tested in while loops
>> in s626_send_dac().  The proper comedi_timeout() calls are then used.
>>
>> Signed-off-by: Chase Southwood 
>> ---
>> Ian, here is a version of this patchset employing the enum you recommended.
>> The second patch has been rebased on top of this one.
>>
>> 2: Used comedi_timeout() where appropriate, introduce callback function
>>
>> 3: Updated callback to switch on new enum.>
>
>Reviewed-by: Ian Abbott 
>
>For future reference, for patches affecting a single comedi driver, we 
>usually title the patches like this:
>
>staging: comedi: name_of_driver: summary of patch
>


Hi Greg!

I was just writing to inquire whether you were able to add this patch as well as
PATCH 2/2 Propagate timeout errors in s626.c, to your queue in their current 
state.
I had to resend this patch to you about a week ago because the subject line got
a little messed up, which might have led to a bit of confusion regarding the 2
patch series, and I wanted to check in to see whether you need me to do anything
further.

Thanks,
Chase
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] usb: gadget: fsl: Add FSL USB Gadget entry in platform device id

2014-03-14 Thread Felipe Balbi

On Fri, Mar 14, 2014 at 08:52:19PM +, suresh.gu...@freescale.com wrote:
> Hi,
> Thanks for reviewing my patches.
> Please find my comments inline
> 
> -Original Message-
> From: Felipe Balbi [mailto:ba...@ti.com] 
> Sent: Thursday, March 13, 2014 8:56 PM
> To: Gupta Suresh-B42813
> Cc: ba...@ti.com; gre...@linuxfoundation.org; linux-...@vger.kernel.org; 
> linux-kernel@vger.kernel.org; Gupta Suresh-B42813
> Subject: Re: [PATCH] usb: gadget: fsl: Add FSL USB Gadget entry in platform 
> device id
> 
> On Thu, Mar 13, 2014 at 07:35:31PM +0530, Suresh Gupta wrote:
> > From: Suresh Gupta 
> > 
> > Add FSL USB Gadget entry in platform device id table
> 
> why this tab ?
> [SuresH] I will remove it in next version. 
> 
> > Signed-off-by: Suresh Gupta 
> > ---
> >  drivers/usb/gadget/fsl_udc_core.c | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git a/drivers/usb/gadget/fsl_udc_core.c 
> > b/drivers/usb/gadget/fsl_udc_core.c
> > index b7dea4e..35b20e6 100644
> > --- a/drivers/usb/gadget/fsl_udc_core.c
> > +++ b/drivers/usb/gadget/fsl_udc_core.c
> > @@ -2654,6 +2654,8 @@ static const struct platform_device_id 
> > fsl_udc_devtype[] = {
> > }, {
> > .name = "imx-udc-mx51",
> > }, {
> > +   .name = "fsl-usb2-udc",
> 
> why aren't you just using chipidea ?
> [SuresH] This is our legacy driver for all previous and existing ppc
> socs. Many of our customers are already using this, and we need to
> support them on this driver. We do have plans to shift to chipidea,
> but after some time. 

cool, you already have plans, so we will see a new glue layer for v3.16
right ? Which means I don't need to take this patch either.

-- 
balbi


signature.asc
Description: Digital signature

Re: [PATCH] USB : Gadget: fsl: add information message

2014-03-14 Thread Felipe Balbi

On Fri, Mar 14, 2014 at 08:52:49PM +, suresh.gu...@freescale.com wrote:
> Hi,
> 
> -Original Message-
> From: Felipe Balbi [mailto:ba...@ti.com] 
> Sent: Thursday, March 13, 2014 8:56 PM
> To: Gupta Suresh-B42813
> Cc: ba...@ti.com; gre...@linuxfoundation.org; linux-...@vger.kernel.org; 
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] USB : Gadget: fsl: add information message
> 
> On Thu, Mar 13, 2014 at 06:41:50PM +0530, Suresh Gupta wrote:
> > Message helps to understand that the Freescale Gadget driver
> > is up without any error.
> 
> why this tab ?
> [SuresH] I will remove it in next version 
> 
> > 
> > Signed-off-by: Suresh Gupta 
> > ---
> >  drivers/usb/gadget/fsl_udc_core.c | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/drivers/usb/gadget/fsl_udc_core.c 
> > b/drivers/usb/gadget/fsl_udc_core.c
> > index 9a93727..455d0ab 100644
> > --- a/drivers/usb/gadget/fsl_udc_core.c
> > +++ b/drivers/usb/gadget/fsl_udc_core.c
> > @@ -2503,6 +2503,7 @@ static int __init fsl_udc_probe(struct 
> > platform_device *pdev)
> > goto err_del_udc;
> >  
> > create_proc_file();
> > +   pr_info("%s (%s)\n", driver_desc, DRIVER_VERSION);
> 
> if there are no messages, then it's up. Also, you can very easily check lsmod.
> 
> sorry, but I'm not taking this.
> [SuresH] The legacy driver shows this message when module_init was
> used. To make same upbringing for customer, I added the same message
> after probe get  completed. This message will not make any difference
> but will satisfies customer need.   

#!/bin/sh

lsmod | grep fsl_usb2_udc

if [ $? -eq 0 ]; then
echo "Yeah, USB driver is loaded"
else
echo "Bad! Bad driver"
fi

Still not taking your patch

-- 
balbi


signature.asc
Description: Digital signature

Re: [PATCH 2/3] bridge: trigger a bridge calculation upon port changes

2014-03-14 Thread Luis R. Rodriguez

On Thu, Mar 13, 2014 at 11:26:25AM -0700, Cong Wang wrote:
> On Wed, Mar 12, 2014 at 8:15 PM, Luis R. Rodriguez
>  wrote:
> > spin_lock_bh(>br->lock);
> > err = br_setport(p, tb);
> > +   changed = br_stp_recalculate_bridge_id(p->br);
> 
> Looks like you only want to check if the mac addr gets changed here,

Nope, the reason why we want a full thorough check is that br_setport()
may change currently any of these:

  * IFLA_BRPORT_MODE
  * IFLA_BRPORT_GUARD
  * IFLA_BRPORT_FAST_LEAVE
  * IFLA_BRPORT_PROTECT
  * IFLA_BRPORT_LEARNING,
  * IFLA_BRPORT_UNICAST_FLOOD
  * IFLA_BRPORT_COST
  * IFLA_BRPORT_PRIORITY
  * IFLA_BRPORT_STATE

That's a good reason to trigger a good inspection. Having the MAC address
change would be simply collateral and it's just something we need to do
some additional work for outside of the locking context.

> but br_stp_recalculate_bridge_id() does more than just checking it,
> are you sure the side-effects are all what you want here?

Yeap.

> > spin_unlock_bh(>br->lock);
> > +   if (changed)
> > +   call_netdevice_notifiers(NETDEV_CHANGEADDR,
> > +p->br->dev);
> > +   netdev_update_features(p->br->dev);
> 
> I think this is supposed to be in netdev event handler of br->dev
> instead of here.

Do you mean netdev_update_features() ? I mimicked what was being done on
br_del_if() given that root blocking is doing something similar. If
we need to change something for the above then I suppose it means we need
to change br_del_if() too. Let me know if you see any reason for something
else.

  Luis

pgp3qLaJhzs5L.pgp
Description: PGP signature

Re: [PATCH] USB: Gadget: fsl driver pullup fix

2014-03-14 Thread Felipe Balbi

Hi,

(first of all, please fix your email client, we need the quotation
marks. See Documentation/email-clients.txt)

On Fri, Mar 14, 2014 at 08:53:24PM +, suresh.gu...@freescale.com wrote:
> > On Thu, Mar 13, 2014 at 06:40:55PM +0530, Suresh Gupta wrote:
> > > Attached is a small fix for the fsl usb gadget driver. This fix the 
> > > driver in a way that the usb device will be only "pulled up" on 
> > > requests like other usb gadget drivers do.
> > > This is necessary, because the device information is not always 
> > > available until an application is up and running which provides this 
> > > datas.
> > > 
> > > Signed-off-by: Stefani Seibold 
> > > Signed-off-by: Suresh Gupta 
> > > ---
> > >  drivers/usb/gadget/fsl_udc_core.c | 38 
> > > +-
> > >  1 file changed, 21 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/drivers/usb/gadget/fsl_udc_core.c 
> > > b/drivers/usb/gadget/fsl_udc_core.c
> > > index 35cb972..9a93727 100644
> > > --- a/drivers/usb/gadget/fsl_udc_core.c
> > > +++ b/drivers/usb/gadget/fsl_udc_core.c
> > > @@ -153,6 +153,21 @@ static inline void fsl_set_accessors(struct 
> > > fsl_usb2_platform_data *pdata) {}
> > >  /
> > >   *   Internal Used Function
> > >  /
> > > +static int can_pullup(struct fsl_udc *udc) {
> > > + return udc->driver && udc->softconnect && udc->vbus_active; }
> > > +
> > > +static void set_pullup(struct fsl_udc *udc) {
> > > + if (can_pullup(udc))
> > > + fsl_writel((fsl_readl(_regs->usbcmd) | USB_CMD_RUN_STOP),
> > > + _regs->usbcmd);
> > > + else
> > > + fsl_writel((fsl_readl(_regs->usbcmd) & ~USB_CMD_RUN_STOP),
> > > + _regs->usbcmd);
> > > +}
> > 
> > why is this a "fix", you just re-factored some code into set_pullup().
> >
> [SuresH] I set udc->vbus_active and udc->softconnect to default value
> of 1 in struct_udc_setup. This was actual fix in this patch.  The

right, you see now why is it a problem to mix cleanups with fixes ? You
*never*, ever combine unrelated changes in a single patch. It makes it a
lot more difficult to see what you're actually changing. So, to start
with, this patch should (if it was correct) be split into two smaller
patches: one re-factoring the duplicated code into set_pullup() and
another which fixes vbus_active and softconnect flags.

But hang on, before you do that...

> can_pullup function return false when these values was not set and
> intern the code return without enabling the pullup and gadget
> controller stops. 

So here's your mistake: the idea of can_pullup() (and thus, vbus_active
and softconnect flags) is to tell the driver "we're connected to a host,
it's safe to connect your pullups".

When you set both those flags to true, you're telling the driver that
you, indeed, are connected to a host. This might not be true if you
first boot up your platform, load all drivers and only after connect the
cable. In essence, you're lying to your driver and, as my mommy used to
say, "nobody likes a liar, my boy".

Current situation isn't very good either since the driver is assuming
that cable is only plugged after driver is loaded, so it won't cope very
well with situation where cable is first plugged, then you apply power
to your board.

What you *really* need to do here is ask the HW for initial states of
those flags. During your probe() routine - as the name says - you
probe your HW to check its state (or to initialize its state), then you
ask "Hey IP, is VBUS above session valid threshold ?" Then you use the
HW's reply to initialize both flags, the way you want.

cheers

-- 
balbi

signature.asc
Description: Digital signature

Re: 3.14-rc: /proc/acpi/battery gone?

2014-03-14 Thread Rafael J. Wysocki

On Friday, March 14, 2014 06:14:12 PM Ilia Mirkin wrote:
> On Fri, Mar 14, 2014 at 6:11 PM, Pavel Machek  wrote:
> > On Fri 2014-03-14 17:29:41, Ilia Mirkin wrote:
> >> On Fri, Mar 14, 2014 at 5:14 PM, Pavel Machek  wrote:
> >> > Hi!
> >> >
> >> > It seems /proc/acpi/battery interface is gone, and I don't see any
> >> > option to reintroduce it... what is going on?
> >>
> >> The interface went away in a semi-recent kernel release (3.13 or
> >> 3.12), breaking pretty much every battery app. (Admittedly the
> >> interface was marked as deprecated for quite some time, but that
> >> didn't stop everyone from using it and not caring about the new
> >> thing.) I've yet to find a windowmaker dock app that works with the
> >> current sysfs API :(
> >
> > Name one application it broke, and we'll get it reverted. It broke my
> > by-hand journalling, at the very least.
> 
> wmbattery
> 
> They have attempted to use the sysfs api, but apparently that
> integration was done with an older version of that API. There's also
> some attempt to get it to work with upower, but I couldn't figure out
> how to make that work either on my (up-to-date gentoo) box. (TBH I
> didn't spend more than an hour or two on it, so it may not be
> impossible.)

Tianyu, can you please have a look at this?

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] mfd: bcm590xx: Fix type argument for module device table

2014-03-14 Thread Axel Lin

This fixes below build error.

FATAL: drivers/mfd/bcm590xx: sizeof(struct i2c_device_id)=24 is not a modulo of 
the size of section __mod_i2c_device_table=392.
Fix definition of struct i2c_device_id in mod_devicetable.h
make[1]: *** [__modpost] Error 1
make: *** [modules] Error 2

Signed-off-by: Axel Lin 
---
 drivers/mfd/bcm590xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c
index 926a57e..e9a33c7 100644
--- a/drivers/mfd/bcm590xx.c
+++ b/drivers/mfd/bcm590xx.c
@@ -68,7 +68,7 @@ static const struct of_device_id bcm590xx_of_match[] = {
{ .compatible = "brcm,bcm59056" },
{ }
 };
-MODULE_DEVICE_TABLE(i2c, bcm590xx_of_match);
+MODULE_DEVICE_TABLE(of, bcm590xx_of_match);
 
 static const struct i2c_device_id bcm590xx_i2c_id[] = {
{ "bcm59056" },
-- 
1.8.1.2



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH V2 06/13] ACPI: introduce enumerable_id flag

2014-03-14 Thread Zhang, Rui



> -Original Message-
> From: Wysocki, Rafael J
> Sent: Saturday, March 15, 2014 9:03 AM
> To: Zhang, Rui
> Cc: linux-a...@vger.kernel.org; linux-kernel@vger.kernel.org;
> bhelg...@google.com; matthew.garr...@nebula.com;
> dmitry.torok...@gmail.com
> Subject: Re: [PATCH V2 06/13] ACPI: introduce enumerable_id flag
> Importance: High
> 
> On 3/13/2014 5:16 PM, Zhang Rui wrote:
> > Only certain kind of ACPI device objects can be enumerated via ACPI.
> > These ACPI device objects include
> > 1. ACPI device objects that have _HID control method.
> > 2. some ACPI device objects that have Linux specified HID strings.
> >
> > In order to distinguish those device objects from the others, a new
> > flag enumerable_id and a new function acpi_add_eid() are introduced
> in this patch.
> 
> I don't really like the name of the new flag.  What about calling it
> platform_id (it is supposed to indicate that the core should create a
> platform device for it)?
>
I was concerned about the same problem, but could not get a better name.
Yes, platform_id sounds much better.

> > Currently, only devices with _HID method have this flag set.
> > And in the future, if a device that has Linux specified HID strings
> > wants to be enumerated to platform bus, acpi_add_eid() should be used
> 
> And what about calling the new function acpi_add_platform_id()
> accordingly?
>
Agreed.
 
> > instead of acpi_add_id() when adding its Linux specified HID string.
> 
> And I don't quite understand the last paragraph as a whole.  Is it
> supposed to mean "if you want platform devices to be created for device
> objects without _HID, use acpi_add_platform_id() when adding artificial
> Linux-specific ID strings to them"?
> 
Yes.
Currently, we use acpi_add_id() for devices like video, thermal, etc.
If we want to see them in platform bus, we just need a one-line change to
replace acpi_add_id() with acpi_add_platform_id().

Thanks,
rui

> > Signed-off-by: Zhang Rui 
> > ---
> >   drivers/acpi/scan.c |8 +++-
> >   include/acpi/acpi_bus.h |3 ++-
> >   2 files changed, 9 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index
> > 399257e..768f81d 100644
> > --- a/drivers/acpi/scan.c
> > +++ b/drivers/acpi/scan.c
> > @@ -1679,6 +1679,12 @@ static void acpi_add_id(struct acpi_device_pnp
> *pnp, const char *dev_id)
> > pnp->type.hardware_id = 1;
> >   }
> >
> > +static void acpi_add_eid(struct acpi_device_pnp *pnp, const char
> > +*dev_id) {
> > +   acpi_add_id(pnp, dev_id);
> > +   pnp->type.enumerable_id = 1;
> > +}
> > +
> >   /*
> >* Old IBM workstations have a DSDT bug wherein the SMBus object
> >* lacks the SMBUS01 HID and the methods do not have the necessary
> "_"
> > @@ -1729,7 +1735,7 @@ static void acpi_set_pnp_ids(acpi_handle handle,
> struct acpi_device_pnp *pnp,
> > }
> >
> > if (info->valid & ACPI_VALID_HID)
> > -   acpi_add_id(pnp, info->hardware_id.string);
> > +   acpi_add_eid(pnp, info->hardware_id.string);
> > if (info->valid & ACPI_VALID_CID) {
> > cid_list = >compatible_id_list;
> > for (i = 0; i < cid_list->count; i++) diff --git
> > a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index
> > 8c5e235..688ca44 100644
> > --- a/include/acpi/acpi_bus.h
> > +++ b/include/acpi/acpi_bus.h
> > @@ -217,7 +217,8 @@ struct acpi_hardware_id {
> >   struct acpi_pnp_type {
> > u32 hardware_id:1;
> > u32 bus_address:1;
> > -   u32 reserved:30;
> > +   u32 enumerable_id:1;
> > +   u32 reserved:29;
> >   };
> >
> >   struct acpi_device_pnp {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC 4/5] clocksource: omap-timer: Introduce clocksource driver for OMAP SoCs

2014-03-14 Thread Joel Fernandes

On 03/14/2014 07:13 PM, Suman Anna wrote:
> Hi Joel,
> 
> On 03/13/2014 03:35 PM, Joel Fernandes wrote:
>> We introduce functions to initialize clocksource and clockevent, use
>> CLOCKSOURCE_OF_DECLARE to declare the clocksource, and handle the
>> clocksource
>> selection on a per-SoC basis (Currently only AM335x is supported).
>> Powering up
>> of the timer will be done with the help of the mach-omap layer function
>> that's
>> introduced earlier in the series.
>>
>> We make a local copy of dmtimer API for use by clocksource, the original
>> dmtimer API in plat-omap is kept as-is till the migration of all SoCs is
>> completed after which it can be deleted.
>>
>> Signed-off-by: Joel Fernandes 
>> ---
>>   drivers/clocksource/Makefile |1 +
>>   drivers/clocksource/omap-timer.c | 1157
>> ++
>>   drivers/clocksource/omap-timer.h |  422 ++
>>   3 files changed, 1580 insertions(+)
>>   create mode 100644 drivers/clocksource/omap-timer.c
>>   create mode 100644 drivers/clocksource/omap-timer.h
>>
>> diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
>> index c7ca50a..2ffe698 100644
>> --- a/drivers/clocksource/Makefile
>> +++ b/drivers/clocksource/Makefile
>> @@ -37,3 +37,4 @@ obj-$(CONFIG_ARM_ARCH_TIMER)+= arm_arch_timer.o
>>   obj-$(CONFIG_ARM_GLOBAL_TIMER)+= arm_global_timer.o
>>   obj-$(CONFIG_CLKSRC_METAG_GENERIC)+= metag_generic.o
>>   obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST)+= dummy_timer.o
>> +obj-y+= omap-timer.o
>> diff --git a/drivers/clocksource/omap-timer.c
>> b/drivers/clocksource/omap-timer.c
>> new file mode 100644
>> index 000..91593d8
>> --- /dev/null
>> +++ b/drivers/clocksource/omap-timer.c
>> @@ -0,0 +1,1157 @@
>> +/*
>> + * drivers/clocksource/omap-timer.c
>> + *
>> + * OMAP Dual-Mode Timers
>> + *
>> + * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com/
>> + * Joel Fernandes 
>> + * Tarun Kanti DebBarma 
>> + * Thara Gopinath 
>> + *
>> + * dmtimer adaptation to platform_driver.
>> + *
>> + * Copyright (C) 2005 Nokia Corporation
>> + * OMAP2 support by Juha Yrjola
>> + * API improvements and OMAP2 clock framework support by Timo Teras
>> + *
>> + * Copyright (C) 2014 Texas Instruments
>> + * Added OMAP4 support - Santosh Shilimkar 
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License as published by the
>> + * Free Software Foundation; either version 2 of the License, or (at your
>> + * option) any later version.
>> + *
>> + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
>> + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
>> + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
>> + * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
>> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
>> + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
>> + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> + *
>> + * You should have received a copy of the  GNU General Public License along
>> + * with this program; if not, write  to the Free Software Foundation, Inc.,
>> + * 675 Mass Ave, Cambridge, MA 02139, USA.
>> + */
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include "omap-timer.h"
>> +/*
>> + *  TODO: OMAP1 support removed due to need for header mach/hardware.h
>> + *OMAP2 support may be broken due to lack of cpu_is stuff, see
>> omap_dm_timer_get_errata
>> + */
>> +
>> +/**
>> + * omap_dm_timer_get_errata - get errata flags for a timer
>> + *
>> + * Get the timer errata flags that are specific to the OMAP device being
>> used.
>> + */
>> +static u32 __init omap_dm_timer_get_errata(void)
>> +{
>> +/* ifdef'd out due to lack of availaibility of soc.h */
>> +#if 0
>> +if (cpu_is_omap24xx())
>> +return 0;
> 
> You should be able to fix this using some compatible checks.

Thanks. I'll use of_device_is_compatible to check for that.

-Joel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V2 06/13] ACPI: introduce enumerable_id flag

2014-03-14 Thread Rafael J. Wysocki


On 3/13/2014 5:16 PM, Zhang Rui wrote:

Only certain kind of ACPI device objects can be enumerated via ACPI.
These ACPI device objects include
1. ACPI device objects that have _HID control method.
2. some ACPI device objects that have Linux specified HID strings.

In order to distinguish those device objects from the others, a new flag
enumerable_id and a new function acpi_add_eid() are introduced in this patch.


I don't really like the name of the new flag.  What about calling it 
platform_id (it is supposed to indicate that the core should create a 
platform device for it)?



Currently, only devices with _HID method have this flag set.
And in the future, if a device that has Linux specified HID strings
wants to be enumerated to platform bus, acpi_add_eid() should be used


And what about calling the new function acpi_add_platform_id() accordingly?


instead of acpi_add_id() when adding its Linux specified HID string.


And I don't quite understand the last paragraph as a whole.  Is it 
supposed to mean "if you want platform devices to be created for device 
objects without _HID, use acpi_add_platform_id() when adding artificial 
Linux-specific ID strings to them"?



Signed-off-by: Zhang Rui 
---
  drivers/acpi/scan.c |8 +++-
  include/acpi/acpi_bus.h |3 ++-
  2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 399257e..768f81d 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1679,6 +1679,12 @@ static void acpi_add_id(struct acpi_device_pnp *pnp, 
const char *dev_id)
pnp->type.hardware_id = 1;
  }
  
+static void acpi_add_eid(struct acpi_device_pnp *pnp, const char *dev_id)

+{
+   acpi_add_id(pnp, dev_id);
+   pnp->type.enumerable_id = 1;
+}
+
  /*
   * Old IBM workstations have a DSDT bug wherein the SMBus object
   * lacks the SMBUS01 HID and the methods do not have the necessary "_"
@@ -1729,7 +1735,7 @@ static void acpi_set_pnp_ids(acpi_handle handle, struct 
acpi_device_pnp *pnp,
}
  
  		if (info->valid & ACPI_VALID_HID)

-   acpi_add_id(pnp, info->hardware_id.string);
+   acpi_add_eid(pnp, info->hardware_id.string);
if (info->valid & ACPI_VALID_CID) {
cid_list = >compatible_id_list;
for (i = 0; i < cid_list->count; i++)
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 8c5e235..688ca44 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -217,7 +217,8 @@ struct acpi_hardware_id {
  struct acpi_pnp_type {
u32 hardware_id:1;
u32 bus_address:1;
-   u32 reserved:30;
+   u32 enumerable_id:1;
+   u32 reserved:29;
  };
  
  struct acpi_device_pnp {


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [v3.13][v3.14][Regression] kthread: make kthread_create() killable

2014-03-14 Thread Tetsuo Handa

Joseph Salisbury wrote:
> A kernel bug report was opened against Ubuntu[0].  We performed a kernel
> bisect, and found that reverting the following commit resolved this bug:

I added a comment to that bug report.

This commit by chance revealed incorrect error handling of mptsas_probe() or
mptscsih_remove().
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] PCI: Do not enable INTx in pci_reenable_device()

2014-03-14 Thread Yinghai Lu

On Tue, Mar 11, 2014 at 10:48 AM, Bjorn Helgaas  wrote:
> Andreas reported that after 1f42db786b14 ("PCI: Enable INTx if BIOS left
> them disabled"), pciehp surprise removal stopped working.
>
> This happens because pci_reenable_device() on the hotplug bridge (used in
> the pciehp_configure_device() path) clears the Interrupt Disable bit, which
> apparently breaks the bridge's MSI hotplug event reporting.
>
> Previously we cleared the Interrupt Disable bit in do_pci_enable_device(),
> which is used by both pci_enable_device() and pci_reenable_device().  But
> we use pci_reenable_device() after the driver may have enabled MSI or
> MSI-X, and we *set* Interrupt Disable as part of enabling MSI/MSI-X.
>
> This patch clears Interrupt Disable only when MSI/MSI-X has not been
> enabled.
>
> Fixes: 1f42db786b14 PCI: Enable INTx if BIOS left them disabled
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=71691
> Reported-and-tested-by: Andreas Noever 
> Signed-off-by: Bjorn Helgaas 
> CC: sta...@vger.kernel.org
> CC: Sarah Sharp 
> ---
>  drivers/pci/pci.c |3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 8dc3e701ec57..79fc89c6c3f3 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -1192,6 +1192,9 @@ static int do_pci_enable_device(struct pci_dev *dev, 
> int bars)
> return err;
> pci_fixup_device(pci_fixup_enable, dev);
>
> +   if (dev->msi_enabled || dev->msix_enabled)
> +   return 0;
> +
> pci_read_config_byte(dev, PCI_INTERRUPT_PIN, );
> if (pin) {
> pci_read_config_word(dev, PCI_COMMAND, );

looks ugly.

We really should move that irq handling out of pci_enable_device.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Trusted kernel patchset for Secure Boot lockdown

2014-03-14 Thread One Thousand Gnomes

> So as far as the narrow question of whether we should accept these
> patches, I think it's a good thing.  Personally, I'm always going to
> be disabling UEFI secure boot (even if it doesn't brick my laptop),
> because for me, the security guarantees it provides isn't worth it.
> But there will be people who want to be able to install Linux on
> Windows 8 certified PC's without tweaking BIOS settings, so merging
> the UEFI secure boot is a good thing, so long as those of us who
> don't want to have anything to do with UEFI secure boot can disable
> it.

I definitely think we want the feature and there are a lot of non UEFI
reasons for this (eg running trusted_kernel() virtual namespaces). I have
three specific issues

1. The implementation is a mess in part because it propagates more policy
all over the place that should be separated. Root cause capable() mixes
policy and activity. Fix suggested in my previous emails (and offer to do
the work)

2. It's likely to lead to more bugs and errors because of the way it has
been done and it doesn't break old code that gets added without
considering the issue. It fails insecure which is bad. Fixed by doing
what I suggested (and offered to do)

3. For things like module options we should be whitelisting, not blacklisting
'bad' ones.

I've offered to go and fix up the capability stuff - I'm just waiting for
Matthew to actually confirm the question I specifically asked him - does
this solve that bit of his problem. If it does great, I'll go and sort
the capability bits out so we can keep the policy in the right place and
we don't have the kernel festooned with && !trusted_kernel() everywhere.

There is a question of completeness but it's very clear we get there with
a combination of two things - whitelisting so we catch stuff we missed
rather than leave holes, and just accepting the reality that it'll take a
few kernels once it's upstream until we get them all.

I care about security, we should do the job properly. We have a
further magnitude shift in security needs coming that's going to be at
least equivalent to the scale of shift between the old 'university,
everyone is nice' internet and today.

Alan
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

btrfs: lock inversion between delayed_node->mutex and found->groups_sem

2014-03-14 Thread Sasha Levin


Hi all,

While fuzzing with trinity inside a KVM tools guest running the latest -next
kernel I've stumbled on the following:

[  788.451695] =
[  788.452455] [ INFO: possible irq lock inversion dependency detected ]
[  788.453020] 3.14.0-rc6-next-20140313-sasha-00010-gb8c1db1-dirty #217 
Tainted: GW
[  788.453827] -
[  788.454371] kswapd3/4199 just changed the state of lock:
[  788.454902]  (_node->mutex){+.+.-.}, at: 
__btrfs_release_delayed_node+0x4f/0x140 (fs/btrfs/delayed-inode.c:263)
[  788.455890] but this lock took another, RECLAIM_FS-unsafe lock in the past:
[  788.456543]  (>groups_sem){+.}

and interrupts could create inverse lock ordering between them.

[  788.457491]
[  788.457491] other info that might help us debug this:
[  788.458115]  Possible interrupt unsafe locking scenario:
[  788.458115]
[  788.458756]CPU0CPU1
[  788.459188]
[  788.459625]   lock(>groups_sem);
[  788.460041]local_irq_disable();
[  788.460041]lock(_node->mutex);
[  788.460041]lock(>groups_sem);
[  788.460041]   
[  788.460041] lock(_node->mutex);
[  788.460041]
[  788.460041]  *** DEADLOCK ***
[  788.460041]
[  788.460041] 2 locks held by kswapd3/4199:
[  788.460041]  #0:  (shrinker_rwsem){..}, at: shrink_slab+0x3f/0x160 
(mm/vmscan.c:360)
[  788.460041]  #1:  (>s_umount_key#108){.+.+..}, at: 
grab_super_passive+0x56/0x90 (fs/super.c:361)
[  788.460041]
[  788.460041] the shortest dependencies between 2nd lock and 1st lock:
[  788.460041]  -> (>groups_sem){+.} ops: 46 {
[  788.460041] HARDIRQ-ON-W at:
[  788.460041]   mark_irqflags+0xf0/0x170 
(kernel/locking/lockdep.c:2800)
[  788.460041]   __lock_acquire+0x2de/0x5a0 
(kernel/locking/lockdep.c:3138)
[  788.460041]   lock_acquire+0x182/0x1d0 
(arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602)
[  788.460041]   down_write+0x5c/0xc0 
(arch/x86/include/asm/rwsem.h:130 kernel/locking/rwsem.c:50)
[  788.460041]   __link_block_group+0x45/0x110 
(fs/btrfs/extent-tree.c:8348)
[  788.460041]   btrfs_read_block_groups+0x3ae/0x700 
(fs/btrfs/extent-tree.c:8533)
[  788.460041]   open_ctree+0x1abf/0x2210 
(fs/btrfs/disk-io.c:2749)
[  788.460041]   btrfs_fill_super+0x81/0x140 
(fs/btrfs/super.c:958)
[  788.460041]   btrfs_mount+0x26a/0x300 
(fs/btrfs/super.c:1295)
[  788.460041]   mount_fs+0x8d/0x1a0 (fs/super.c:1091)
[  788.460041]   vfs_kern_mount+0x79/0x150 
(fs/namespace.c:813)
[  788.460041]   do_new_mount+0xcd/0x1c0 
(fs/namespace.c:2068)
[  788.460041]   do_mount+0x15d/0x210 
(fs/namespace.c:2392)
[  788.460041]   SyS_mount+0x9d/0xe0 (fs/namespace.c:2589 
fs/namespace.c:2560)
[  788.460041]   tracesys+0xdd/0xe2 
(arch/x86/kernel/entry_64.S:749)
[  788.460041] HARDIRQ-ON-R at:
[  788.460041]   mark_irqflags+0xbc/0x170 
(kernel/locking/lockdep.c:2792)
[  788.460041]   __lock_acquire+0x2de/0x5a0 
(kernel/locking/lockdep.c:3138)
[  788.460041]   lock_acquire+0x182/0x1d0 
(arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602)
[  788.460041]   down_read+0x4c/0xa0 
(arch/x86/include/asm/rwsem.h:83 kernel/locking/rwsem.c:23)
[  788.460041]   
btrfs_calc_num_tolerated_disk_barrier_failures+0x2a7/0x3a0 
(fs/btrfs/disk-io.c:3309)
[  788.460041]   open_ctree+0x1af7/0x2210 
(fs/btrfs/disk-io.c:2755)
[  788.460041]   btrfs_fill_super+0x81/0x140 
(fs/btrfs/super.c:958)
[  788.460041]   btrfs_mount+0x26a/0x300 
(fs/btrfs/super.c:1295)
[  788.460041]   mount_fs+0x8d/0x1a0 (fs/super.c:1091)
[  788.460041]   vfs_kern_mount+0x79/0x150 
(fs/namespace.c:813)
[  788.460041]   do_new_mount+0xcd/0x1c0 
(fs/namespace.c:2068)
[  788.460041]   do_mount+0x15d/0x210 (fs/namespace.c:2392)
[  788.460041]   SyS_mount+0x9d/0xe0 (fs/namespace.c:2589 
fs/namespace.c:2560)
[  788.460041]   tracesys+0xdd/0xe2 
(arch/x86/kernel/entry_64.S:749)
[  788.460041] SOFTIRQ-ON-W at:
[  788.460041]   mark_irqflags+0x110/0x170 
(kernel/locking/lockdep.c:2804)
[  788.460041]   __lock_acquire+0x2de/0x5a0 
(kernel/locking/lockdep.c:3138)
[  788.460041]   lock_acquire+0x182/0x1d0 
(arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602)
[  788.460041]

Re: [RFC 4/5] clocksource: omap-timer: Introduce clocksource driver for OMAP SoCs

2014-03-14 Thread Suman Anna


Hi Joel,

On 03/13/2014 03:35 PM, Joel Fernandes wrote:

We introduce functions to initialize clocksource and clockevent, use
CLOCKSOURCE_OF_DECLARE to declare the clocksource, and handle the clocksource
selection on a per-SoC basis (Currently only AM335x is supported). Powering up
of the timer will be done with the help of the mach-omap layer function that's
introduced earlier in the series.

We make a local copy of dmtimer API for use by clocksource, the original
dmtimer API in plat-omap is kept as-is till the migration of all SoCs is
completed after which it can be deleted.

Signed-off-by: Joel Fernandes 
---
  drivers/clocksource/Makefile |1 +
  drivers/clocksource/omap-timer.c | 1157 ++
  drivers/clocksource/omap-timer.h |  422 ++
  3 files changed, 1580 insertions(+)
  create mode 100644 drivers/clocksource/omap-timer.c
  create mode 100644 drivers/clocksource/omap-timer.h

diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index c7ca50a..2ffe698 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_ARM_ARCH_TIMER)  += arm_arch_timer.o
  obj-$(CONFIG_ARM_GLOBAL_TIMER)+= arm_global_timer.o
  obj-$(CONFIG_CLKSRC_METAG_GENERIC)+= metag_generic.o
  obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST) += dummy_timer.o
+obj-y  += omap-timer.o
diff --git a/drivers/clocksource/omap-timer.c b/drivers/clocksource/omap-timer.c
new file mode 100644
index 000..91593d8
--- /dev/null
+++ b/drivers/clocksource/omap-timer.c
@@ -0,0 +1,1157 @@
+/*
+ * drivers/clocksource/omap-timer.c
+ *
+ * OMAP Dual-Mode Timers
+ *
+ * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com/
+ * Joel Fernandes 
+ * Tarun Kanti DebBarma 
+ * Thara Gopinath 
+ *
+ * dmtimer adaptation to platform_driver.
+ *
+ * Copyright (C) 2005 Nokia Corporation
+ * OMAP2 support by Juha Yrjola
+ * API improvements and OMAP2 clock framework support by Timo Teras
+ *
+ * Copyright (C) 2014 Texas Instruments
+ * Added OMAP4 support - Santosh Shilimkar 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+ * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You should have received a copy of the  GNU General Public License along
+ * with this program; if not, write  to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include "omap-timer.h"
+/*
+ *  TODO: OMAP1 support removed due to need for header mach/hardware.h
+ *OMAP2 support may be broken due to lack of cpu_is stuff, see 
omap_dm_timer_get_errata
+ */
+
+/**
+ * omap_dm_timer_get_errata - get errata flags for a timer
+ *
+ * Get the timer errata flags that are specific to the OMAP device being used.
+ */
+static u32 __init omap_dm_timer_get_errata(void)
+{
+   /* ifdef'd out due to lack of availability of soc.h */
+#if 0
+   if (cpu_is_omap24xx())
+   return 0;


You should be able to fix this using some compatible checks.

regards
Suman


+#endif
+   return OMAP_TIMER_ERRATA_I103_I767;
+}
+
+


-snip-

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH] Support map_pages() for DAX

2014-03-14 Thread Toshi Kani

On Sat, 2014-03-15 at 01:32 +0200, Kirill A. Shutemov wrote:
> On Fri, Mar 14, 2014 at 05:03:19PM -0600, Toshi Kani wrote:
> > +void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
> > +   get_block_t get_block)
> > +{
> > +   struct file *file = vma->vm_file;
> > +   struct inode *inode = file_inode(file);
> > +   struct buffer_head bh;
> > +   struct address_space *mapping = file->f_mapping;
> > +   unsigned long vaddr = (unsigned long)vmf->virtual_address;
> > +   pgoff_t pgoff = vmf->pgoff;
> > +   sector_t block;
> > +   pgoff_t size;
> > +   unsigned long pfn;
> > +   pte_t *pte = vmf->pte;
> > +   int error;
> > +
> > +   while (pgoff < vmf->max_pgoff) {
> > +   size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> > +   if (pgoff >= size)
> > +   return;
> > +
> > +   memset(&bh, 0, sizeof(bh));
> > +   block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
> > +   bh.b_size = PAGE_SIZE;
> > +   error = get_block(inode, block, &bh, 0);
> > +   if (error || bh.b_size < PAGE_SIZE)
> > +   goto next;
> > +
> > +   if (!buffer_mapped(&bh) || buffer_unwritten(&bh) ||
> > +   buffer_new(&bh))
> > +   goto next;
> > +
> > +   /* Recheck i_size under i_mmap_mutex */
> > +   mutex_lock(&mapping->i_mmap_mutex);
> 
> NAK. Have you tested this with lockdep enabled?
>
> ->map_pages() called with page table lock taken and ->i_mmap_mutex
> should be taken before it. It seems we need to take ->i_mmap_mutex in
> do_read_fault() before calling ->map_pages().

Thanks for pointing this out! I will make sure to test with lockdep next
time.

> Side note: I'm sceptical about the whole idea of using i_mmap_mutex to protect
> against truncate. It will not scale well enough compared to lock_page()
> with its granularity.

I see.  I will think about it as well.  

Thanks,
-Toshi



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: linux-next: build failure after merge of the driver-core tree

2014-03-14 Thread Greg KH

On Fri, Mar 14, 2014 at 09:14:55AM +1100, Benjamin Herrenschmidt wrote:
> On Thu, 2014-03-13 at 11:37 +1100, Benjamin Herrenschmidt wrote:
> > On Wed, 2014-03-12 at 16:21 -0400, Tejun Heo wrote:
> > > It's a series of rather complex patches.  I really don't think
> > > duplicating them is a good idea.  We can either resurrect the old API
> > > to kill it again or set up a merge branch which I don't think is too
> > > unusual in situations like this.
> > 
> > Right, a topic branch that gets merged in both driver-core-next and
> > powerpc-next.
> 
> Just want to make sure we agree ... ie, the offending commit is already
> in powerpc-next on my side and I can't really back it out (I could
> revert it though).

You can pull in driver-core-next into your tree if you want, it's not
going to be reverted, and will be sent to Linus for 3.15-rc1, so you can
base your work on it and fix up the api usage in your tree that way.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] [RFC] perf: Fix a race between ring_buffer_detach() and ring_buffer_wakeup()

2014-03-14 Thread Paul E. McKenney

On Fri, Mar 14, 2014 at 04:02:31PM -0700, Paul E. McKenney wrote:
> On Fri, Mar 14, 2014 at 11:43:17PM +0100, Peter Zijlstra wrote:
> > On Fri, Mar 14, 2014 at 01:47:37PM -0700, Paul E. McKenney wrote:
> > > This general idea can be made to work, but it will need some
> > > internal-to-RCU help.  One vulnerability of the patch below is the
> > > following sequence of steps:
> > > 
> > > 1.RCU has just finished a grace period, and is doing the
> > >   end-of-grace-period accounting.
> > > 
> > > 2.The code below invokes rcu_batches_completed().  Let's assume
> > >   the result returned is 42.
> > > 
> > > 3.RCU completes the end-of-grace-period accounting, and increments
> > >   rcu_sched_state.completed.
> > > 
> > > 4.The code below checks ->rcu_batches against the result from
> > >   another invocation of rcu_batches_completed() and sees that
> > >   the 43 is not equal to 42, so skips the synchronize_rcu().
> > > 
> > > Except that a grace period has not actually completed.  Boom!!!
> > > 
> > > The problem is that rcu_batches_completed() is only intended to give
> > > progress information on RCU.
> > 
> > Ah, I thought I was missing something when I was looking through the rcu
> > code in a hurry :-)
> 
> Well, given that I sometimes miss things when looking through RCU code
> carefully, I guess I cannot give you too much trouble about it.
> 
> > I knew there'd be some subtlety between completed and gpnum and such :-)
> 
> Some of which I have learned about one RCU bug at a time.  ;-)
> 
> > > What I can do is give you a pair of functions, one to take a snapshot of
> > > the current grace-period state (returning an unsigned long) and another
> > > to evaluate a previous snapshot, invoking synchronize_rcu() if there has
> > > not been a full grace period in the meantime.
> > > 
> > > The most straightforward approach would involve acquiring the global
> > > rcu_state ->lock on each call, which I am guessing just might be
> > > considered to be excessive overhead.  ;-)  I should be able to decrease
> > > the overhead to a memory barrier on each call, and perhaps even down
> > > to an smp_load_acquire().  Accessing the RCU state probably costs you
> > > a cache miss both times.
> > > 
> > > Thoughts?
> > 
> > Sounds fine, the attach isn't a hotpath, so even the locked version
> > should be fine, but I won't keep you from making it all fancy and such
> > :-)
> 
> Fair enough, let me see what I can come up with.

And here is an untested patch.  Thoughts?

(And yes, I need to update documentation and torture tests accordingly.)

Thanx, Paul



rcu: Provide grace-period piggybacking API

The following pattern is currently not well supported by RCU:

1.  Make data element inaccessible to RCU readers.

2.  Do work that probably lasts for more than one grace period.

3.  Do something to make sure RCU readers in flight before #1 above
have completed.

Here are some things that could currently be done:

a.  Do a synchronize_rcu() unconditionally at either #1 or #3 above.
This works, but imposes needless work and latency.

b.  Post an RCU callback at #1 above that does a wakeup, then
wait for the wakeup at #3.  This works well, but likely results
in an extra unneeded grace period.  Open-coding this is also
a bit more semi-tricky code than would be good.

This commit therefore adds get_state_synchronize_rcu() and
cond_synchronize_rcu() APIs.  Call get_state_synchronize_rcu() at #1
above and pass its return value to cond_synchronize_rcu() at #3 above.
This results in a call to synchronize_rcu() if no grace period has
elapsed between #1 and #3, but requires only a load, comparison, and
memory barrier if a full grace period did elapse.

Reported-by: Peter Zijlstra 
Signed-off-by: Paul E. McKenney 

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c9be2235712c..dbf0f225bca0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1503,13 +1503,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time(rsp);
-   smp_wmb(); /* Record GP times before starting GP. */
-   rsp->gpnum++;
+   /* Record GP times before starting GP, hence smp_store_release(). */
+   smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
 
/* Exclude any concurrent CPU-hotplug operations. */
mutex_lock(&rsp->onoff_mutex);
+   smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
 
/*
 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1638,10 +1639,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
}
rnp =

Re: [PATCHv4 0/7] omap hwspinlock dt support

2014-03-14 Thread Suman Anna


Hi Ohad,

On 03/14/2014 03:10 PM, Ohad Ben-Cohen wrote:

Hi Suman, Mark,

On Mon, Feb 24, 2014 at 8:14 PM, Suman Anna  wrote:

Mark, Ohad,

...

Gentle reminder, can you provide your acks/comments?


Sorry for the late jump in.

I have a few comments:


Thanks for the comments. They probably cover a few topics that are slightly
beyond the scope of the series, but are nevertheless good discussion
points for finalizing the series.



- Hardware spinlocks must have global and system-wide id numbers;
these numbers cannot be maintained internally by the Linux Kernel.
Think of an SoC with several asynchronous heterogeneous processors,
each of which is running a different OS, and they all need to use a
specific hardware spinlock in order to share some resource. For that
to happen, every hwlock must have a predefined and deterministic id
number which is global in the system. We can't have those id numbers
be relative to an hwlock "controller", and we can't have two hwlock
"controllers" share the same id numbers.


The series doesn't change the semantics of hwspinlock registration or
add a new OF controller registration function. Implementations would
still need to register a controller using a base_id and number of locks. 
The series rather adds a DT-friendly function _ONLY_ for requesting a 
specific hwlock, and there are no restrictions on the args specifier 
being relative id numbers. Though this is what the simple default xlate 
helper does (most common usage), the added xlate ops and #hwlock-cells 
should allow individual implementation drivers to adjust any variations, 
and return a relative lock w.r.t its registered base_id, as this is how 
a lock gets registered in the first place.




- I suspect the simplest and most straight forward way to achieve this
is by (a) bringing back the concept of the base_id property, and


I actually started out this series with the base_id property, and 
dropped it in v3 based on comments looking at it from the 
request-specific-lock semantics with DT. That said, the drivers still 
need to manage a 'base_id' needed for registration when they get probed 
for multiple controllers. Getting the base_id from DT _may_ be useful 
just for registration purposes, but for requesting a hwlock, a 
controller phandle and an implementation defined args-specifier should 
suffice IMHO.



(b)

letting the global hwlock id be the DT identifier (plus the base_id)
and then providing it directly to the drivers when needed. The latter
is required in order to support dynamic allocation of hwlocks, in
which case Linux must know the global system-wide id number, and then
share it with the other asynchronous OSes via some IPC.


Each lock still retains a global lock id value, and you can retrieve it 
using the existing hwspin_lock_get_id(). Why is the latter required for 
dynamic allocation, isn't it the other way around, allocate a lock, and 
you will be able to get the lock id. If wanting to request a specific 
lock received across, the regular hwspin_lock_request_specific should be 
used.




- If we feel there's no way any system is going to have more than a
single hwlock controller, then we can live without a base_id property,
but then this needs to be clearly documented and prohibited. Today
both the hwlock DT representation, and the coupled kernel code,
implicitly allow this anomaly to exist.


I haven't removed the concept of base_id, it is just not defined in the 
DT-bindings, and am currently expecting the drivers to manage it and use 
it for registration.




- Hwlock controller nodes should have a list of reserved locks (those
locks for which other nodes have a phandle+identifier pointing at) to
prevent those locks from being dynamically allocated by eager drivers.


The exact notion of informing the hwspinlock core about a list of 
reserved locks is missing at the moment (even in the non-DT case). I am 
not sure if this got lost during the conversion of the registration from 
per lock to registering a bank of locks together, or if it is implied by 
the base_id + num_locks combination. The core today supports requesting 
only those locks that were actually registered, whether allocating a 
free one dynamically or giving a specific one.


There were some slightly similar comments from Kumar earlier on the v2 
series, please see the thread in [1].




Most of these issues were discussed by Arnd, Benoit and myself back then,
please see below:
http://lists.infradead.org/pipermail/linux-arm-kernel/2011-September/064265.html


Thanks for the pointer to the previous discussion, I wasn't aware of an 
earlier attempt. The primary approach on requesting locks is actually no 
different from what Arnd suggested originally.


regards
Suman

[1] http://marc.info/?l=linux-omap&m=138031002012191&w=2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please

Re: [PATCH 1/3] mm: munlock: fix a bug where THP tail page is encountered

2014-03-14 Thread Sasha Levin


On 12/17/2013 08:00 AM, Vlastimil Babka wrote:

From: Vlastimil Babka
Date: Fri, 13 Dec 2013 14:25:21 +0100
Subject: [PATCH 1/3] mm: munlock: fix a bug where THP tail page is encountered

Since commit ff6a6da60 ("mm: accelerate munlock() treatment of THP pages")
munlock skips tail pages of a munlocked THP page. However, when the head page
already has PageMlocked unset, it will not skip the tail pages.

Commit 7225522bb ("mm: munlock: batch non-THP page isolation and
munlock+putback using pagevec") has added a PageTransHuge() check which
contains VM_BUG_ON(PageTail(page)). Sasha Levin found this triggered using
trinity, on the first tail page of a THP page without PageMlocked flag.

This patch fixes the issue by skipping tail pages also in the case when
PageMlocked flag is unset. There is still a possibility of race with THP page
split between clearing PageMlocked and determining how many pages to skip.
The race might result in former tail pages not being skipped, which is however
no longer a bug, as during the skip the PageTail flags are cleared.

However this race also affects correctness of NR_MLOCK accounting, which is to
be fixed in a separate patch.


I've hit the same thing again, on the latest -next, this time with a different 
trace:

[  539.199120] page:ea0013249a80 count:0 mapcount:1 mapping:  
(null) index:0x0
[  539.200429] page flags: 0x12f80008000(tail)
[  539.201167] [ cut here ]
[  539.201889] kernel BUG at include/linux/page-flags.h:415!
[  539.202859] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[  539.204588] Dumping ftrace buffer:
[  539.206415](ftrace buffer empty)
[  539.207022] Modules linked in:
[  539.207503] CPU: 3 PID: 18262 Comm: trinity-c228 Tainted: GW 
3.14.0-rc6-next-20140313-sasha-00010-gb8c1db1-dirty #217
[  539.209012] task: 880627b1 ti: 8805a44c2000 task.ti: 
8805a44c2000
[  539.209989] RIP:  munlock_vma_pages_range+0x93/0x1d0 
(include/linux/page-flags.h:415 mm/mlock.c:494)
[  539.210263] RSP: :8805a44c3e08  EFLAGS: 00010246
[  539.210263] RAX: 88052ae126a0 RBX: 0006a000 RCX: 0099
[  539.210263] RDX:  RSI: 880627b10cf0 RDI: 04c926a0
[  539.210263] RBP: 8805a44c3ec8 R08: 0001 R09: 0001
[  539.210263] R10: 0001 R11: 0001 R12: ea0013249a80
[  539.210263] R13: 88039dc95a00 R14: 0006b000 R15: 8805a44c3e94
[  539.210263] FS:  7fd6ce14a700() GS:88042b80() 
knlGS:
[  539.210263] CS:  0010 DS:  ES:  CR0: 8005003b
[  539.210263] CR2: 7fd6ce0ef6ac CR3: 0006025cd000 CR4: 06a0
[  539.210263] DR0: 00698000 DR1:  DR2: 
[  539.210263] DR3:  DR6: 0ff0 DR7: 0600
[  539.210263] Stack:
[  539.210263]    00018805a44c3e38 

[  539.210263]   88039dc95a00 a44c3e88 

[  539.210263]  00ff8805a44c3e58 880528f0a0f0 8805a44c3eb8 
88039dc95a00
[  539.210263] Call Trace:
[  539.210263]  do_munmap+0x1d2/0x360 (mm/internal.h:168 mm/mmap.c:2547)
[  539.210263]  ? down_write+0xa6/0xc0 (kernel/locking/rwsem.c:51)
[  539.210263]  ? vm_munmap+0x46/0x80 (mm/mmap.c:2571)
[  539.210263]  vm_munmap+0x54/0x80 (mm/mmap.c:2572)
[  539.210263]  SyS_munmap+0x2c/0x40 (mm/mmap.c:2577)
[  539.210263]  tracesys+0xdd/0xe2 (arch/x86/kernel/entry_64.S:749)
[  539.210263] Code: ff 49 89 c4 48 85 c0 0f 84 f3 00 00 00 48 3d 00 f0 ff ff 0f 87 
e7 00 00 00 48 8b 00 66 85 c0 79 17 31 f6 4c 89 e7 e8 4d d2 fc ff <0f> 0b 0f 1f 
00 eb fe 66 0f 1f 44 00 00 49 8b 04 24 f6 c4 40 74
[  539.210263] RIP  munlock_vma_pages_range+0x93/0x1d0 
(include/linux/page-flags.h:415 mm/mlock.c:494)
[  539.210263]  RSP 
[  539.23] ---[ end trace 4e90dc9141579181 ]---


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] arm/xen: Don't use xen DMA ops when the device is protected by an IOMMU

2014-03-14 Thread gre...@linuxfoundation.org

On Fri, Mar 14, 2014 at 04:50:23PM +, Julien Grall wrote:
> On 02/24/2014 08:49 PM, Stefano Stabellini wrote:
> > On Mon, 24 Feb 2014, gre...@linuxfoundation.org wrote: 
> > Julien is proposing to store the list of "safe" devices on an hash table
> > in the Xen specific code (in arch/arm/xen/enlighten.c, see
> > http://marc.info/?l=linux-kernel&m=139291370526082&w=2).
> > Whenever Linux is about to do DMA, we would check in the hashtable to
> > figure out whether we need to go through the swiotlb or we can simply
> > use the native dma_ops.
> > 
> > Ian and I were thinking that it would be much easier and faster to have
> > a "xen_safe_device" parameter in struct device and just check for that.
> > It doesn't actually need to be in struct device, it could simply be a
> > flag in struct device_dma_parameters as Ian was suggesting.
> > 
> > Julien, could you please come up with a simple patch to demonstrate the
> > concept?
> 
> Hello Stefano and Greg,
> 
> Sorry for the late answer. I wrote a simple patch which depends on patch #1.
> Let me know if it's the right direction.

I have no context here, care to start the patch series over?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] net: Implement SO_PEERCGROUP

2014-03-14 Thread Eric W. Biederman

Vivek Goyal  writes:

> On Wed, Mar 12, 2014 at 07:12:25PM -0700, Andy Lutomirski wrote:
>
>> I can think of at least three other ways to do this.
>> 
>> 1. Fix Docker to use user namespaces and use the uid of the requesting
>> process via SCM_CREDENTIALS.
>
> Using user namespaces sounds like the right way to do it (at least
> conceptually). But I think the hurdle here is that people are not convinced
> yet that user namespaces are secure and work well. IOW, some people
> don't seem to think that user namespaces are ready yet.

If the problem is user namespace immaturity, patches or bug reports need
to be sent for user namespaces.

Containers with user namespaces (however immature they are) are much
more secure than running containers with processes with uid == 0 inside
of them.  User namespaces do considerably reduce the attack surface of
what uid == 0 can do.

> I guess that's the reason people are looking for other ways to
> achieve their goal.

It seems strange to work around a feature that is 99% of the way to
solving their problem with more kernel patches.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] net: Implement SO_PEERCGROUP

2014-03-14 Thread Eric W. Biederman

Vivek Goyal  writes:

> On Thu, Mar 13, 2014 at 12:58:14PM -0700, Andy Lutomirski wrote:
>> On Thu, Mar 13, 2014 at 12:53 PM, Vivek Goyal  wrote:
>> > On Thu, Mar 13, 2014 at 10:55:16AM -0700, Andy Lutomirski wrote:
>> >
>> > [..]
>> >> >> 2. Docker is a container system, so use the "container" (aka
>> >> >> namespace) APIs.  There are probably several clever things that could
>> >> >> be done with /proc/<pid>/ns.
>> >> >
>> >> > pid is racy, if it weren't I would simply go straight
>> >> > to /proc/<pid>/cgroups ...
>> >>
>> >> How about:
>> >>
>> >> open("/proc/self/ns/ipc", O_RDONLY);
>> >> send the result over SCM_RIGHTS?
>> >
>> > As I don't know I will ask. So what will server now do with this file
>> > descriptor of client's ipc namespace.
>> >
>> > IOW, what information/identifier does it contain which can be
>> > used to map to pre-configured per container/per namespace policies.
>> 
>> Inode number, which will match that assigned to the container at runtime.
>> 
>
> But what would I do with this inode number. I am assuming this is
> generated dynamically when respective namespace was created. To me
> this is like assigning a pid dynamically and one does not create
> policies in user space based on pid. Similarly I will not be able
> to create policies based on an inode number which is generated
> dynamically.
>
> For it to be useful, it should map to something more static which
> user space understands.

But the mapping can be done in userspace.  stat all of the namespaces
you care about, get their inode numbers, and then do a lookup.

Hard coding string based names in the kernel the way cgroups does is
really pretty terrible and it seriously limits the flexibility of the
api, and so far breaks nested containers.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next v3 1/2] r8152: add RTL8152_EARLY_AGG_TIMEOUT_SUPER

2014-03-14 Thread Francois Romieu

hayeswang  :
[...]
> Besides, I don't wish to modify the setting by ethtool when re-loading
> the driver or rebooting every time.

Why ?

The recipe is different but there isn't much setup difference between a
module param and an ethtool command that is run through udev. The latter
is more versatile though.

> Excuse me. Why is it not accepted for being a Kconfig parameter.

You have stated that the optimal value is not easy to figure.

It's thus hard to give much credit to an hardcoded solution.

-- 
Ueimor
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] staging: android: fix coding style

2014-03-14 Thread Greg KH

On Thu, Mar 13, 2014 at 10:44:55AM +0100, mo...@2600fr.org wrote:
> From: mourn 
> 
> Signed-off-by: mourn 

As said in the thread for your other thread, this patch can't be
accepted unless you use your real name, sorry.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH] Support map_pages() for DAX

2014-03-14 Thread Kirill A. Shutemov

On Fri, Mar 14, 2014 at 05:03:19PM -0600, Toshi Kani wrote:
> +void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
> + get_block_t get_block)
> +{
> + struct file *file = vma->vm_file;
> + struct inode *inode = file_inode(file);
> + struct buffer_head bh;
> + struct address_space *mapping = file->f_mapping;
> + unsigned long vaddr = (unsigned long)vmf->virtual_address;
> + pgoff_t pgoff = vmf->pgoff;
> + sector_t block;
> + pgoff_t size;
> + unsigned long pfn;
> + pte_t *pte = vmf->pte;
> + int error;
> +
> + while (pgoff < vmf->max_pgoff) {
> + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + if (pgoff >= size)
> + return;
> +
> + memset(, 0, sizeof(bh));
> + block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
> + bh.b_size = PAGE_SIZE;
> + error = get_block(inode, block, , 0);
> + if (error || bh.b_size < PAGE_SIZE)
> + goto next;
> +
> + if (!buffer_mapped() || buffer_unwritten() ||
> + buffer_new())
> + goto next;
> +
> + /* Recheck i_size under i_mmap_mutex */
> + mutex_lock(>i_mmap_mutex);

NAK. Have you tested this with lockdep enabled?

->map_pages() called with page table lock taken and ->i_mmap_mutex
should be taken before it. It seems we need to take ->i_mmap_mutex in
do_read_fault() before calling ->map_pages().

Side note: I'm sceptical about the whole idea of using i_mmap_mutex to protect
against truncate. It will not scale well enough compared to lock_page()
with its granularity.

> + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + if (unlikely(pgoff >= size)) {
> + mutex_unlock(>i_mmap_mutex);
> + return;
> + }
> +
> + error = dax_get_pfn(inode, , );
> + if (error > 0)
> + dax_set_pte(vma, vaddr, pfn, pte);
> +
> + mutex_unlock(>i_mmap_mutex);
> +next:
> + vaddr += PAGE_SIZE;
> + pgoff++;
> + pte++;
> + }
> +}
> +EXPORT_SYMBOL_GPL(dax_map_pages);

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] staging: android: fix coding style

2014-03-14 Thread Greg KH

On Thu, Mar 13, 2014 at 03:13:04PM +0100, mourn wrote:
> 
> > The changes look good, however you should be a bit more specific about
> > what the
> > fixes are, and that you used checkpatch.pl to find them (I assume).
> >
> 
> Yes, you are right. I can edit the commit log.
> 
> > Also, Signed-off-by: should state your full name, which I'm also assuming
> > isn't
> > 'mourn'. Please see Documentation/SubmittingPatches.
> >
> 
> I have a bit of a problem here. Some companies forbid their employees to
> openly participate in open source projects. A request to the legal
> department will take around 6 months with no hope of positive result...

Then I can not take your patch, sorry, we can not take anonymous
patches, as explained in the documentation that describes what you mean
by adding the "Signed-off-by:" line to a patch.

good luck with your legal department,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Trusted kernel patchset for Secure Boot lockdown

2014-03-14 Thread Theodore Ts'o

On Fri, Mar 14, 2014 at 10:08:40PM +, One Thousand Gnomes wrote:
> > Signed userspace is not a requirement, and therefore any solution that
> > relies on a signed initrd is inadequate. There are use cases that
> > require verification of the initrd and other levels. This isn't one of
> > them.
> 
> The job of the kernel is to solve the general problem. There are lots of
> people who happen to care about verification beyond the kernel so it
> shouldn't be ignored. And they can do things like load trusted SELinux
> rulesets even if you can't support it in your environment.

This is really a question about goals and trust models.  Alan is
arguing that we should support trust models and goals which go far
beyond the goals and trust model of the UEFI Forum.

Matthew is, I think, trying to make the argument that his patches
fulfill the goals that are needed so we can boot Linux distribution
kernels on UEFI secure boot machines without worrying about Microsoft
deciding to revoke keys so that Red Hat or SuSE will no longer be able
to be bootable on desktops that are certified for Windows 8.  And
while we might want to improve the framework to support other trust
models later on, supporting distro kernels on Windows 8 certified PC's
is important enough that we should let these patches into mainline.

Is that a fair summary of the two viewpoints?

Personally, I think that we are fortunate that Windows 8 has been
enough of a train wreck that huge numbers of users have been taking
Windows 8 systems and upgrading them to Windows 7, and hence the need
for Distro kernels that can boot on fully locked down Windows 8 PC's.
But at some point, whether it is a few years or a decade later (if
Windows 7 lives on as long as XP :-), Windows 7 will be EOL'ed, and
even before that, UEFI secure boot will be enabled by default.

Right now, even though Lenovo laptops are shipping with Windows
8, UEFI secure boot is not made mandatory (although it is on enough to
brick the laptop when it runs into bugs with the UEFI BIOS code,
sigh).  But sooner or later, UEFI secure boot will be on by default,
and then if Linux distros don't have kernels where the installer can
be run without needing to twiddle BIOS settings, it might make it
harder for the "Year of the Desktop" to come about.

So as far as the narrow question of whether we should accept these
patches, I think it's a good thing.  Personally, I'm always going to
be disabling UEFI secure boot (even if it doesn't brick my laptop),
because for me, the security guarantees it provides aren't worth it.
But there will be people who want to be able to install Linux on
Windows 8 certified PC's without tweaking BIOS settings, so merging
the UEFI secure boot is a good thing, so long as those of us who
don't want to have anything to do with UEFI secure boot can disable
it.

Regards,

- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 9/9] powerpc/pm: support deep sleep feature on T1040

2014-03-14 Thread Scott Wood

On Wed, 2014-03-12 at 18:40 +0800, Chenhui Zhao wrote:
> On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote:
> > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote:
> > > From: Zhao Chenhui 
> > > 
> > > T1040 supports deep sleep feature, which can switch off most parts of
> > > the SoC when it is in deep sleep mode. This way, it becomes more
> > > energy-efficient.
> > > 
> > > The DDR controller will also be powered off in deep sleep. Therefore,
> > > the last stage (the latter part of fsl_dp_enter_low) will run without DDR
> > > access. This piece of code and related TLBs will be prefetched.
> > > 
> > > > Due to the different initialization code between 32-bit and 64-bit, they
> > > > have separate resume entry and procedure.
> > > 
> > > The feature supports 32-bit and 64-bit kernel mode.
> > > 
> > > Signed-off-by: Zhao Chenhui 
> > > ---
> > >  arch/powerpc/include/asm/booke_save_regs.h |3 +
> > >  arch/powerpc/kernel/cpu_setup_fsl_booke.S  |   17 ++
> > >  arch/powerpc/kernel/head_fsl_booke.S   |   30 +++
> > >  arch/powerpc/platforms/85xx/Makefile   |2 +-
> > >  arch/powerpc/platforms/85xx/deepsleep.c|  201 +++
> > >  arch/powerpc/platforms/85xx/qoriq_pm.c |   38 
> > >  arch/powerpc/platforms/85xx/sleep.S|  295 
> > > 
> > >  arch/powerpc/sysdev/fsl_soc.h  |7 +
> > >  8 files changed, 592 insertions(+), 1 deletions(-)
> > >  create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c
> > >  create mode 100644 arch/powerpc/platforms/85xx/sleep.S
> > > 
> > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h 
> > > b/arch/powerpc/include/asm/booke_save_regs.h
> > > index 87c357a..37c1f6c 100644
> > > --- a/arch/powerpc/include/asm/booke_save_regs.h
> > > +++ b/arch/powerpc/include/asm/booke_save_regs.h
> > > @@ -88,6 +88,9 @@
> > >  #define HIBERNATION_FLAG 1
> > >  #define DEEPSLEEP_FLAG   2
> > >  
> > > +#define CPLD_FLAG1
> > > +#define FPGA_FLAG2
> > 
> > What is this?
> 
> We have two kinds of boards, QDS and RDB.
> They have different register maps. The flag indicates which kind
> of register map the current board is using.

CPLD versus FPGA is not a meaningful difference.  We don't care what
technology is used to implement programmable logic -- we care what
programming interface is exposed.  Customers will have their own boards
that will likely not imitate either of these programming interfaces, but
what they do have will still probably be implemented in a CPLD or FPGA.
Likewise, Freescale may have future reference boards whose CPLD/FPGA is
not compatible.

So use better naming, and structure the code so it's easy to plug in
implementations for new or custom boards.
 
> > > diff --git a/arch/powerpc/kernel/head_fsl_booke.S 
> > > b/arch/powerpc/kernel/head_fsl_booke.S
> > > index 20204fe..3285752 100644
> > > --- a/arch/powerpc/kernel/head_fsl_booke.S
> > > +++ b/arch/powerpc/kernel/head_fsl_booke.S
> > > @@ -162,6 +162,19 @@ _ENTRY(__early_start)
> > >  #include "fsl_booke_entry_mapping.S"
> > >  #undef ENTRY_MAPPING_BOOT_SETUP
> > >  
> > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM)
> > > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */
> > > + LOAD_REG_ADDR(r4, deep_sleep_flag)
> > > + lwz r3, 0(r4)
> > > + cmpwi   r3, 0
> > > + beq 11f
> > > + /* clear deep_sleep_flag */
> > > + li  r3, 0
> > > + stw r3, 0(r4)
> > > + b   fsl_deepsleep_resume
> > > +11:
> > > +#endif
> > 
> > Why do you come in via the normal kernel entry, versus specifying a
> > direct entry point for deep sleep resume?  How does U-Boot even know
> > what the normal entry is when resuming?
> 
> I wish to return to a specified point (like 64-bit mode), but the code in
> fsl_booke_entry_mapping.S only can run in the first page. Because it
> only setups a temp mapping of 4KB.

Why do you need the entry mapping on 32-bit but not 64-bit?
> 
> > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM)
> > > +_ENTRY(__entry_deep_sleep)
> > > +/*
> > > + * Bootloader will jump to here when resuming from deep sleep.
> > > + * After executing the init code in fsl_booke_entry_mapping.S,
> > > + * will jump to the real resume entry.
> > > + */
> > > + li  r8, 1
> > > + bl  12f
> > > +12:  mflrr9
> > > + addir9, r9, (deep_sleep_flag - 12b)
> > > + stw r8, 0(r9)
> > > + b __early_start
> > > +deep_sleep_flag:
> > > + .long   0
> > > +#endif
> > 
> > It's a bit ambiguous to say "entry_deep_sleep" when it's resuming rather
> > than entering...
> 
> How about __fsl_entry_resume?

fsl_booke_deep_sleep_resume

> > > +#define FSLDELAY(count)  \
> > > + li  r3, (count)@l;  \
> > > + slwir3, r3, 10; \
> > > + mtctr   r3; \
> > > +101: nop;\
> > > + bdnz101b;
> > 
> > You don't need a namespace prefix on local macros in a non-header file.
> > 
> > Is

Re: [RFC PATCH] MAINTAINERS: Add linux.n...@intel.com to INTEL ETHERNET DRIVERS

2014-03-14 Thread Jeff Kirsher

On Fri, 2014-03-14 at 15:43 -0400, David Miller wrote:
> From: Joe Perches 
> Date: Thu, 13 Mar 2014 10:11:45 -0700
> 
> > If this is added to the driver files, then maybe it's
> > appropriate to add to MAINTAINERS as well.
> > 
> > Signed-off-by: Joe Perches 
> 
> Intel folks, this look ok to you?

Yeah, it is fine.  Sorry, I overlooked this because of the RFC.

Acked-by: Jeff Kirsher 


signature.asc
Description: This is a digitally signed message part

[RFC PATCH] Support map_pages() for DAX

2014-03-14 Thread Toshi Kani

DAX provides direct access to NVDIMM and bypasses the page caches.
Newly introduced map_pages() callback reduces page faults by adding
mappings around a faulted page, which is not supported for DAX.

This patch implements map_pages() callback for DAX.  It reduces a
number of page faults and increases read performance of DAX as shown
below.  The values in parenthesis are relative to the base DAX results.

iozone results of mmap read/re-read tests [KB/sec]
 64KB:  read: 3,560,777 (x1.6) re-read: 9,086,412 (x1.8) pfault:   121 (-20%)
 128MB: read: 4,374,906 (x1.7) re-read: 6,137,189 (x2.4) pfault: 8,312 (-87%)

Signed-off-by: Toshi Kani 

Applies on top of DAX patchset [1] and fault-around patchset [2].

[1] https://lkml.org/lkml/2014/2/25/460
[2] https://lkml.org/lkml/2014/2/27/546
---
 fs/dax.c   |   68 
 fs/ext4/file.c |6 +
 include/linux/fs.h |5 
 3 files changed, 79 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index c8dfab0..bc54705 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -476,3 +476,71 @@ int dax_zero_page_range(struct inode *inode, loff_t from, 
unsigned length,
return 0;
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+static void dax_set_pte(struct vm_area_struct *vma, unsigned long addr,
+   unsigned long pfn, pte_t *pte)
+{
+   pte_t entry;
+
+   if (addr < vma->vm_start || addr >= vma->vm_end)
+   return;
+
+   if (!pte_none(*pte))
+   return;
+
+   entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot));
+   set_pte_at(vma->vm_mm, addr, pte, entry);
+   update_mmu_cache(vma, addr, pte);
+}
+
+void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
+   get_block_t get_block)
+{
+   struct file *file = vma->vm_file;
+   struct inode *inode = file_inode(file);
+   struct buffer_head bh;
+   struct address_space *mapping = file->f_mapping;
+   unsigned long vaddr = (unsigned long)vmf->virtual_address;
+   pgoff_t pgoff = vmf->pgoff;
+   sector_t block;
+   pgoff_t size;
+   unsigned long pfn;
+   pte_t *pte = vmf->pte;
+   int error;
+
+   while (pgoff < vmf->max_pgoff) {
+   size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   if (pgoff >= size)
+   return;
+
+   memset(, 0, sizeof(bh));
+   block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
+   bh.b_size = PAGE_SIZE;
+   error = get_block(inode, block, , 0);
+   if (error || bh.b_size < PAGE_SIZE)
+   goto next;
+
+   if (!buffer_mapped() || buffer_unwritten() ||
+   buffer_new())
+   goto next;
+
+   /* Recheck i_size under i_mmap_mutex */
+   mutex_lock(>i_mmap_mutex);
+   size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   if (unlikely(pgoff >= size)) {
+   mutex_unlock(>i_mmap_mutex);
+   return;
+   }
+
+   error = dax_get_pfn(inode, , );
+   if (error > 0)
+   dax_set_pte(vma, vaddr, pfn, pte);
+
+   mutex_unlock(>i_mmap_mutex);
+next:
+   vaddr += PAGE_SIZE;
+   pgoff++;
+   pte++;
+   }
+}
+EXPORT_SYMBOL_GPL(dax_map_pages);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index eb19383..15965ea 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -205,6 +205,11 @@ static int ext4_dax_fault(struct vm_area_struct *vma, 
struct vm_fault *vmf)
/* Is this the right get_block? */
 }
 
+static void ext4_dax_map_pages(struct vm_area_struct *vma, struct vm_fault 
*vmf)
+{
+   return dax_map_pages(vma, vmf, ext4_get_block);
+}
+
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
return dax_mkwrite(vma, vmf, ext4_get_block);
@@ -212,6 +217,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault  = ext4_dax_fault,
+   .map_pages  = ext4_dax_map_pages,
.page_mkwrite   = ext4_dax_mkwrite,
.remap_pages= generic_file_remap_pages,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d0381ab..3bd1042 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2527,6 +2527,7 @@ ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, 
const struct iovec *,
loff_t, unsigned segs, get_block_t, dio_iodone_t, int flags);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_mkwrite(struct vm_area_struct *, struct vm_fault *, get_block_t);
+void dax_map_pages(struct vm_area_struct *, struct vm_fault *, get_block_t);
 #else
 static inline int dax_clear_blocks(struct inode

Re: [PATCH] perf/x86/intel: Use rdmsrl_safe when initializing RAPL PMU.

2014-03-14 Thread Venkatesh Srinivas


On Fri, Mar 14, 2014 at 10:57:58AM -0600, David Ahern wrote:

On 3/14/14, 10:17 AM, Andi Kleen wrote:

The Intel ISR section for RDMSR seems to say: "Specifying a reserved
or unimplemented
MSR address in ECX will also cause a general protection exception".

From a guest's perspective, MSR_RAPL_POWER_UNIT is unimplemented; kvm matches
this behavior.


MSRs are model specific and defined per model number. If you report a model
number you're expected to implement the MSRs defined for that model number.

AFAIK Xen just reports 0 for unknown MSRs (and I'm surprised KVM doesn't too)

I would suggest to fix KVM.


I believe ignore_msrs parameter to kvm handles that.

David


Hi,

cc-ing the virtualization mailing list for more detail on the kvm
default for ignore_msrs (it defaults off).

1) Just returning 0 for unsupported MSRs is not workable -- 0 may be a
   meaningful value for an MSR. RDMSR/WRMSR already have a mechanism
   for out-of-band errors, #GP.

2) #GP has been KVM's default behavior for quite some time. Even if we
   believe changing KVM's default is appropriate, Linux w/ the RAPL PMU
   code enabled will fail to boot on existing KVM versions. W/ this
   change, Linux will boot on prior KVM versions.

Thanks,
-- vs;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 01/11] base: power: Add generic OF-based power domain look-up

2014-03-14 Thread Kevin Hilman

Tomasz Figa  writes:

> This patch introduces generic code to perform power domain look-up using
> device tree and automatically bind devices to their power domains.
> Generic device tree binding is introduced to specify power domains of
> devices in their device tree nodes.
>
> Backwards compatibility with legacy Samsung-specific power domain
> bindings is provided, but for now the new code is not compiled when
> CONFIG_ARCH_EXYNOS is selected to avoid collision with legacy code. This
> will change as soon as Exynos power domain code gets converted to use
> the generic framework in further patch.
>
> Signed-off-by: Tomasz Figa 

Reviewed-by: Kevin Hilman 

The approach and binding both look good to me, other than a few minor
nits on comments and question on the locking below...

[...]

> @@ -2177,3 +2181,297 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
>   list_add(>gpd_list_node, _list);
>   mutex_unlock(_list_lock);
>  }
> +
> +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
> +/*
> + * DEVICE TREE BASED POWER DOMAIN PROVIDERS

why all caps?

[...]

> +/* See of_genpd_get_from_provider(). */
> +static struct generic_pm_domain *__of_genpd_get_from_provider(
> + struct of_phandle_args *genpdspec)
> +{
> + struct of_genpd_provider *provider;
> + struct generic_pm_domain *genpd = ERR_PTR(-EPROBE_DEFER);
> +
> + /* Check if we have such a provider in our array */

I think you want to take the mutex here...

> + list_for_each_entry(provider, _genpd_providers, link) {
> + if (provider->node == genpdspec->np)
> + genpd = provider->xlate(genpdspec, provider->data);
> + if (!IS_ERR(genpd))
> + break;
> + }

...and release it here, right?

[...]

> +/*
> + * DEVICE<->DOMAIN BINDING USING DEVICE TREE LOOK-UP

hmm, more yelling?


Otherwise, looks good to me.

Kevin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] [RFC] perf: Fix a race between ring_buffer_detach() and ring_buffer_wakeup()

2014-03-14 Thread Paul E. McKenney

On Fri, Mar 14, 2014 at 11:43:17PM +0100, Peter Zijlstra wrote:
> On Fri, Mar 14, 2014 at 01:47:37PM -0700, Paul E. McKenney wrote:
> > This general idea can be made to work, but it will need some
> > internal-to-RCU help.  One vulnerability of the patch below is the
> > following sequence of steps:
> > 
> > 1.  RCU has just finished a grace period, and is doing the
> > end-of-grace-period accounting.
> > 
> > 2.  The code below invokes rcu_batches_completed().  Let's assume
> > the result returned is 42.
> > 
> > 3.  RCU completes the end-of-grace-period accounting, and increments
> > rcu_sched_state.completed.
> > 
> > 4.  The code below checks ->rcu_batches against the result from
> > another invocation of rcu_batches_completed() and sees that
> > the 43 is not equal to 42, so skips the synchronize_rcu().
> > 
> > Except that a grace period has not actually completed.  Boom!!!
> > 
> > The problem is that rcu_batches_completed() is only intended to give
> > progress information on RCU.
> 
> Ah, I thought I was missing something when I was looking through the rcu
> code in a hurry :-)

Well, given that I sometimes miss things when looking through RCU code
carefully, I guess I cannot give you too much trouble about it.

> I knew there'd be some subtlety between completed and gpnum and such :-)

Some of which I have learned about one RCU bug at a time.  ;-)

> > What I can do is give you a pair of functions, one to take a snapshot of
> > the current grace-period state (returning an unsigned long) and another
> > to evaluate a previous snapshot, invoking synchronize_rcu() if there has
> > not been a full grace period in the meantime.
> > 
> > The most straightforward approach would invoke acquiring the global
> > rcu_state ->lock on each call, which I am guessing just might be
> > considered to be excessive overhead.  ;-)  I should be able to decrease
> > the overhead to a memory barrier on each call, and perhaps even down
> > to an smp_load_acquire().  Accessing the RCU state probably costs you
> > a cache miss both times.
> > 
> > Thoughts?
> 
> Sounds fine, the attach isn't a hotpath, so even the locked version
> should be fine, but I won't keep you from making it all fancy and such
> :-)

Fair enough, let me see what I can come up with.

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 8/9] powerpc/85xx: add save/restore functions for core registers

2014-03-14 Thread Scott Wood

On Wed, 2014-03-12 at 17:42 +0800, Chenhui Zhao wrote:
> On Tue, Mar 11, 2014 at 07:45:14PM -0500, Scott Wood wrote:
> > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote:
> > > From: Wang Dongsheng 
> > > 
> > > Add booke_cpu_state_save() and booke_cpu_state_restore() functions which 
> > > can be
> > > used to save/restore CPU's registers in the case of deep sleep and 
> > > hibernation.
> > > 
> > > Supported processors: E6500, E5500, E500MC, E500v2 and E500v1.
> > > 
> > > Signed-off-by: Wang Dongsheng 
> > > Signed-off-by: Chenhui Zhao 
> > > ---
> > >  arch/powerpc/include/asm/booke_save_regs.h |   96 
> > >  arch/powerpc/kernel/Makefile   |1 +
> > >  arch/powerpc/kernel/booke_save_regs.S  |  361 
> > > 
> > >  3 files changed, 458 insertions(+), 0 deletions(-)
> > >  create mode 100644 arch/powerpc/include/asm/booke_save_regs.h
> > >  create mode 100644 arch/powerpc/kernel/booke_save_regs.S
> > > 
> > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h 
> > > b/arch/powerpc/include/asm/booke_save_regs.h
> > > new file mode 100644
> > > index 000..87c357a
> > > --- /dev/null
> > > +++ b/arch/powerpc/include/asm/booke_save_regs.h
> > > @@ -0,0 +1,96 @@
> > > +/*
> > > + *  Save/restore e500 series core registers
> > 
> > Filename says booke, comment says e500.
> > 
> > Filename and comment also fail to point out that this is specifically
> > for standby/suspend, not for hibernate which is implemented in
> > swsusp_booke.S/swsusp_asm64.S.
> 
> Sorry for the inconsistency. Will change e500 to booke.
> Hibernation and suspend can share the code.

Maybe they could, but AFAICT this patchset doesn't make that happen --
and I'm not convinced that the churn would be worthwhile.  Note that
swsusp_asm64.S is not just for booke, so most of that file would not be
going away if you did make such a change.

I also don't like the way it looks like booke_save_regs.S is a booke
version of ppc_save_regs.S, even though they serve different purposes
and ppc_save_regs.S is still relevant to booke.

> > > + * Software-Use Registers
> > > + *   SPRG1   0x260   (dw * 76), 64-bit need 
> > > to save.
> > > + *   SPRG3   0x268   (dw * 77), 32-bit need 
> > > to save.
> > 
> > What about "CPU and NUMA node for VDSO getcpu" on 64-bit?  Currently
> > SPRG3, but it will need to change for critical interrupt support.
> > 
> > > + * MMU Registers
> > > + *   PID0 - PID2 0x270 ~ 0x280   (dw * 78 ~ dw * 80)
> > 
> > PID1/PID2 are e500v1/v2 only -- and Linux doesn't use them outside of
> > KVM (and you're not in KVM when you're running this code).
> > 
> > Are we ever going to have a non-zero PID at this point?
> 
> I incline to the view that we should save all registers, regardless of
> whether they are used or unused. The good point is that it can be
> compliant with future changes in the usage of registers.
> 
> What do you think?

I agree to a certain extent, but balance it with the complexity of
dealing with registers that don't exist on all booke chips.  If they
don't really need to be saved, why go through the hassle of conditional
code?

> > > + * Debug Registers
> > > + *   DBCR0 - DBCR2   0x288 ~ 0x298   (dw * 81 ~ dw * 83)
> > > + *   IAC1 - IAC4 0x2a0 ~ 0x2b8   (dw * 84 ~ dw * 87)
> > > + *   DAC1 - DAC2 0x2c0 ~ 0x2c8   (dw * 88 ~ dw * 89)
> > > + *
> > > + */
> > 
> > IAC3-4 are not implemented on e500.
> > 
> > Do we really need to save the debug registers?  We're not going to be in
> > a debugged process when we do suspend.  If the concern is kgdb, it
> > probably needs to be told to get out of the way during suspend for other
> > reasons.
> 
> I think in the ideal case the suspend would not break any context. We
> should try to save/restore all cpu state. Of course, trade-off is
> unavoidable in practice.
> 
> > 
> > > +#define SR_GPR1  0x000
> > > +#define SR_GPR2  0x008
> > > +#define SR_GPR13 0x010
> > > +#define SR_FPR14 0x0a8
> > > +#define SR_CR0x138
> > > +#define SR_LR0x140
> > > +#define SR_MSR   0x148
> > 
> > These are very vague names to be putting in a header file.
> 
> How about BOOKE_xx_OFFSET?

Better, but does it need to be in a public header file at all?
 
> > > +/*
> > > + * hibernation and deepsleep save/restore different number of registers,
> > > + * use these flags to indicate.
> > > + */
> > > +#define HIBERNATION_FLAG 1
> > > +#define DEEPSLEEP_FLAG   2
> > 
> > Again, namespacing -- but why is hibernation using this at all?  What's
> > wrong with the existing hibernation support?
> 
> How about BOOKE_HIBERNATION_FLAG?
> 
> Just wish to share code between hibernation and suspend.

No need until and unless you actually implement the change for
hibernation to use this.  As is, this is dead and untestable code.
 
> > >

[PATCH 2/2] Cleanup useless architecture versions of scatterlist.h

2014-03-14 Thread Laura Abbott

There's no need to have an architecture version of scatterlist.h
if the only thing the file does is include asm-generic/scatterlist.h.
Switch to the asm-generic versions directly.

Cc: Mikael Starvik 
Cc: Jesper Nilsson 
Cc: David Howells 
Cc: Hirokazu Takata ,
Cc: Michal Simek 
Cc: David Howells 
Cc: Koichi Yasutake 
Cc: Chen Liqin ,
Cc: Lennox Wu 
Signed-off-by: Laura Abbott 
---
 arch/alpha/include/asm/Kbuild |  1 +
 arch/alpha/include/asm/scatterlist.h  |  6 --
 arch/cris/include/asm/Kbuild  |  1 +
 arch/cris/include/asm/scatterlist.h   |  6 --
 arch/frv/include/asm/Kbuild   |  1 +
 arch/frv/include/asm/scatterlist.h|  6 --
 arch/m32r/include/asm/Kbuild  |  1 +
 arch/m32r/include/asm/scatterlist.h   |  6 --
 arch/microblaze/include/asm/Kbuild|  1 +
 arch/microblaze/include/asm/scatterlist.h |  1 -
 arch/mn10300/include/asm/Kbuild   |  1 +
 arch/mn10300/include/asm/scatterlist.h| 16 
 arch/score/include/asm/Kbuild |  2 +-
 arch/score/include/asm/scatterlist.h  |  6 --
 14 files changed, 7 insertions(+), 48 deletions(-)
 delete mode 100644 arch/alpha/include/asm/scatterlist.h
 delete mode 100644 arch/cris/include/asm/scatterlist.h
 delete mode 100644 arch/frv/include/asm/scatterlist.h
 delete mode 100644 arch/m32r/include/asm/scatterlist.h
 delete mode 100644 arch/microblaze/include/asm/scatterlist.h
 delete mode 100644 arch/mn10300/include/asm/scatterlist.h
 delete mode 100644 arch/score/include/asm/scatterlist.h

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index a73a8e2..2eeb6f2 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += preempt.h
 generic-y += hash.h
+generic-y += scatterlist.h
diff --git a/arch/alpha/include/asm/scatterlist.h 
b/arch/alpha/include/asm/scatterlist.h
deleted file mode 100644
index 017d747..000
--- a/arch/alpha/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ALPHA_SCATTERLIST_H
-#define _ALPHA_SCATTERLIST_H
-
-#include 
-
-#endif /* !(_ALPHA_SCATTERLIST_H) */
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index f3fd876..d9e86c7 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -14,3 +14,4 @@ generic-y += trace_clock.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += scatterlist.h
diff --git a/arch/cris/include/asm/scatterlist.h 
b/arch/cris/include/asm/scatterlist.h
deleted file mode 100644
index f11f8f4..000
--- a/arch/cris/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_CRIS_SCATTERLIST_H
-#define __ASM_CRIS_SCATTERLIST_H
-
-#include 
-
-#endif /* !(__ASM_CRIS_SCATTERLIST_H) */
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index bc42f14..5ccdc46 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += preempt.h
 generic-y += hash.h
+generic-y += scatterlist.h
diff --git a/arch/frv/include/asm/scatterlist.h 
b/arch/frv/include/asm/scatterlist.h
deleted file mode 100644
index 0e5eb30..000
--- a/arch/frv/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_SCATTERLIST_H
-#define _ASM_SCATTERLIST_H
-
-#include 
-
-#endif /* !_ASM_SCATTERLIST_H */
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 932435a..8672b27 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -2,6 +2,7 @@
 generic-y += clkdev.h
 generic-y += exec.h
 generic-y += module.h
+generic-y += scatterlist.h
 generic-y += trace_clock.h
 generic-y += preempt.h
 generic-y += hash.h
diff --git a/arch/m32r/include/asm/scatterlist.h 
b/arch/m32r/include/asm/scatterlist.h
deleted file mode 100644
index 7370b8b..000
--- a/arch/m32r/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_M32R_SCATTERLIST_H
-#define _ASM_M32R_SCATTERLIST_H
-
-#include 
-
-#endif /* _ASM_M32R_SCATTERLIST_H */
diff --git a/arch/microblaze/include/asm/Kbuild 
b/arch/microblaze/include/asm/Kbuild
index 2b98bc7..9270f69 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -4,5 +4,6 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += hash.h
 generic-y += trace_clock.h
+generic-y += scatterlist.h
 generic-y += syscalls.h
 generic-y += preempt.h
diff --git a/arch/microblaze/include/asm/scatterlist.h 
b/arch/microblaze/include/asm/scatterlist.h
deleted file mode 100644
index 35d786f..000
--- a/arch/microblaze/include/asm/scatterlist.h
+++ /dev/null
@@ -1 +0,0 @@
-#include 
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index 992e989..3045ffe 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -3,5 +3,6 @@ generic-y +=

[PATCH 1/2] lib/scatterlist: Make ARCH_HAS_SG_CHAIN an actual Kconfig

2014-03-14 Thread Laura Abbott

Rather than have architectures #define ARCH_HAS_SG_CHAIN in an architecture
specific scatterlist.h, make it a proper Kconfig option and use that
instead. At the same time, remove the header files that are now mostly
useless and just include asm-generic/scatterlist.h.

Cc: Russell King 
Cc: Tony Luck 
Cc: Fenghua Yu 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Signed-off-by: Laura Abbott 
---
 arch/arm/Kconfig   |  1 +
 arch/arm/include/asm/Kbuild|  1 +
 arch/arm/include/asm/scatterlist.h | 12 
 arch/arm64/Kconfig |  1 +
 arch/ia64/Kconfig  |  1 +
 arch/ia64/include/asm/Kbuild   |  1 +
 arch/ia64/include/asm/scatterlist.h|  7 ---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/Kbuild|  1 +
 arch/powerpc/include/asm/scatterlist.h | 17 -
 arch/s390/Kconfig  |  1 +
 arch/s390/include/asm/Kbuild   |  1 +
 arch/s390/include/asm/scatterlist.h|  3 ---
 arch/sparc/Kconfig |  1 +
 arch/sparc/include/asm/Kbuild  |  1 +
 arch/sparc/include/asm/scatterlist.h   |  8 
 arch/x86/Kconfig   |  1 +
 arch/x86/include/asm/Kbuild|  1 +
 arch/x86/include/asm/scatterlist.h |  8 
 include/linux/scatterlist.h|  2 +-
 include/scsi/scsi.h|  2 +-
 lib/Kconfig|  7 +++
 lib/scatterlist.c  |  4 ++--
 23 files changed, 24 insertions(+), 59 deletions(-)
 delete mode 100644 arch/arm/include/asm/scatterlist.h
 delete mode 100644 arch/ia64/include/asm/scatterlist.h
 delete mode 100644 arch/powerpc/include/asm/scatterlist.h
 delete mode 100644 arch/s390/include/asm/scatterlist.h
 delete mode 100644 arch/sparc/include/asm/scatterlist.h
 delete mode 100644 arch/x86/include/asm/scatterlist.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1594945..8122294 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -82,6 +82,7 @@ config ARM
  .
 
 config ARM_HAS_SG_CHAIN
+   select ARCH_HAS_SG_CHAIN
bool
 
 config NEED_SG_DMA_LENGTH
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 3278afe..2357ed6 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -18,6 +18,7 @@ generic-y += param.h
 generic-y += parport.h
 generic-y += poll.h
 generic-y += resource.h
+generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += sembuf.h
diff --git a/arch/arm/include/asm/scatterlist.h 
b/arch/arm/include/asm/scatterlist.h
deleted file mode 100644
index cefdb8f..000
--- a/arch/arm/include/asm/scatterlist.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASMARM_SCATTERLIST_H
-#define _ASMARM_SCATTERLIST_H
-
-#ifdef CONFIG_ARM_HAS_SG_CHAIN
-#define ARCH_HAS_SG_CHAIN
-#endif
-
-#include 
-#include 
-#include 
-
-#endif /* _ASMARM_SCATTERLIST_H */
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 27bbcfc..f2f95f4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,6 +2,7 @@ config ARM64
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_USE_CMPXCHG_LOCKREF
+   select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 0c8e553..13e2e8b 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -44,6 +44,7 @@ config IA64
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select ARCH_USE_CMPXCHG_LOCKREF
+   select ARCH_HAS_SG_CHAIN
default y
help
  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 283a831..3906865 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -2,6 +2,7 @@
 generic-y += clkdev.h
 generic-y += exec.h
 generic-y += kvm_para.h
+generic-y += scatterlist.h
 generic-y += trace_clock.h
 generic-y += preempt.h
 generic-y += vtime.h
diff --git a/arch/ia64/include/asm/scatterlist.h 
b/arch/ia64/include/asm/scatterlist.h
deleted file mode 100644
index 08fd93b..000
--- a/arch/ia64/include/asm/scatterlist.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_IA64_SCATTERLIST_H
-#define _ASM_IA64_SCATTERLIST_H
-
-#include 
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_IA64_SCATTERLIST_H */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 957bf34..659aee2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_IRQ_EXIT_ON_IRQ_STACK
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
+   select ARCH_HAS_SG_CHAIN
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git

Re: Trusted kernel patchset for Secure Boot lockdown

2014-03-14 Thread Matthew Garrett

On Fri, 2014-03-14 at 22:31 +, One Thousand Gnomes wrote:
> On Fri, 14 Mar 2014 22:15:45 +
> Matthew Garrett  wrote:
> > The general problem includes having to support this even without an
> > selinux policy.
> 
> Yes. No dispute about that. But equally the general solution should allow
> for it.

Well, sure. The current implementation doesn't conflict with selinux in
any way.

> > some other way. ChromeOS will load unmeasured kernel modules provided it
> > can attest to the trustworthiness of the filesystem containing them.
> 
> See "How to Bypass Verified Boot Security in Chromium OS" 8)
> 
> And it attests the trustworthiness of the filesystem by measuring it. If
> you have a measurement of object X that states it is unchanged then you
> have a valid measurement of any subset of object X for which the same
> assertion is proven. In this case since you know all the bits in the root
> fs are as before, so you know all the bits in the module are as before

You may attest to the trustworthiness of a filesystem by measuring it,
but you may also attest to it via some other means - for instance, it's
read-only and stored on media that requires physical presence to
modify. 

-- 
Matthew Garrett

Re: [PATCH 7/9] fsl: add EPU FSM configuration for deep sleep

2014-03-14 Thread Scott Wood

On Wed, 2014-03-12 at 16:34 +0800, Chenhui Zhao wrote:
> On Tue, Mar 11, 2014 at 07:08:43PM -0500, Scott Wood wrote:
> > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote:
> > > From: Hongbo Zhang 
> > > 
> > > In the last stage of deep sleep, software will trigger a Finite
> > > > State Machine (FSM) to control the hardware procedure, such as
> > > board isolation, killing PLLs, removing power, and so on.
> > > 
> > > When the system is waked up by an interrupt, the FSM controls the
> > > > hardware to complete the early resume procedure.
> > > 
> > > > This patch configures the EPU FSM in preparation for deep sleep.
> > > 
> > > Signed-off-by: Hongbo Zhang 
> > > Signed-off-by: Chenhui Zhao 
> > 
> > Couldn't this be part of qoriq_pm.c?
> 
> Put the code in drivers/platform/fsl/ so that LS1 can share these code.

How can LS1 share it if it's got hardcoded T1040 values?

> > > diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig
> > > index 09fde58..6539e6d 100644
> > > --- a/drivers/platform/Kconfig
> > > +++ b/drivers/platform/Kconfig
> > > @@ -6,3 +6,7 @@ source "drivers/platform/goldfish/Kconfig"
> > >  endif
> > >  
> > >  source "drivers/platform/chrome/Kconfig"
> > > +
> > > +if FSL_SOC
> > > +source "drivers/platform/fsl/Kconfig"
> > > +endif
> > 
> > Chrome doesn't need an ifdef -- why does this?
> 
> Don't wish other platform see these options, and the X86 and GOLDFISH have
> ifdefs.

The point is you can implement the dependency inside
drivers/platform/fsl/Kconfig.

> > > diff --git a/drivers/platform/fsl/Makefile b/drivers/platform/fsl/Makefile
> > > new file mode 100644
> > > index 000..d99ca0e
> > > --- /dev/null
> > > +++ b/drivers/platform/fsl/Makefile
> > > @@ -0,0 +1,5 @@
> > > +#
> > > +# Makefile for linux/drivers/platform/fsl
> > > +# Freescale Specific Power Management Drivers
> > > +#
> > > +obj-$(CONFIG_FSL_SLEEP_FSM)  += sleep_fsm.o
> > 
> > Why is this here while the other stuff is in arch/powerpc/sysdev?
> > 
> > > +/* Block offsets */
> > > +#define  RCPM_BLOCK_OFFSET   0x00022000
> > > +#define  EPU_BLOCK_OFFSET0x
> > > +#define  NPC_BLOCK_OFFSET0x1000
> > 
> > Why don't these block offsets come from the device tree?
> 
> > Have mapped DCSR registers. Don't wish to remap them.

We don't wish to have hardcoded CCSR/DCSR offsets in the kernel source.
Sorry.
 
> > > + /* Configure the EPU Counters */
> > > + epu_write(EPCCR15, 0x9284);
> > > + epu_write(EPCCR14, 0x9284);
> > > + epu_write(EPCCR12, 0x9284);
> > > + epu_write(EPCCR11, 0x9284);
> > > + epu_write(EPCCR10, 0x9284);
> > > + epu_write(EPCCR9, 0x9284);
> > > + epu_write(EPCCR8, 0x9284);
> > > + epu_write(EPCCR5, 0x9284);
> > > + epu_write(EPCCR4, 0x9284);
> > > + epu_write(EPCCR2, 0x9284);
> > > +
> > > + /* Configure the SCUs Inputs */
> > > + epu_write(EPSMCR15, 0x7600);
> > > + epu_write(EPSMCR14, 0x0031);
> > > + epu_write(EPSMCR13, 0x3100);
> > > + epu_write(EPSMCR12, 0x7F00);
> > > + epu_write(EPSMCR11, 0x3174);
> > > + epu_write(EPSMCR10, 0x6530);
> > > + epu_write(EPSMCR9, 0x3000);
> > > + epu_write(EPSMCR8, 0x6430);
> > > + epu_write(EPSMCR7, 0x3000);
> > > + epu_write(EPSMCR6, 0x7C00);
> > > + epu_write(EPSMCR5, 0x2E00);
> > > + epu_write(EPSMCR4, 0x002F);
> > > + epu_write(EPSMCR3, 0x2F00);
> > > + epu_write(EPSMCR2, 0x6C70);
> > 
> > Where do these magic numbers come from?  Which chips are they valid for?
> 
> They are for T1040. Can be found in the RCPM chapter of T1040RM.

Then put in a comment to that effect, including what part of the RCPM
chapter.

How do you plan to handle the addition of another SoC with different
values?

-Scott


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[GIT PULL] x86 fixes for v3.14-rc7

2014-03-14 Thread H. Peter Anvin

Hi Linus,

Two x86 fixes: Suresh's eager FPU fix, and a fix to the NUMA quirk for
AMD northbridges.

This only includes Suresh's fix patch, not the "mostly a cleanup"
patch which had __init issues.

The following changes since commit fa389e220254c69ffae0d403eac4146171062d08:

  Linux 3.14-rc6 (2014-03-09 19:41:57 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-urgent-for-linus

for you to fetch changes up to 847d7970defb45540735b3fb4e88471c27cacd85:

  x86/amd/numa: Fix northbridge quirk to assign correct NUMA node (2014-03-14 
11:05:36 +0100)


Daniel J Blueman (1):
  x86/amd/numa: Fix northbridge quirk to assign correct NUMA node

Suresh Siddha (1):
  x86, fpu: Check tsk_used_math() in kernel_fpu_end() for eager FPU

 arch/x86/kernel/i387.c   | 15 ---
 arch/x86/kernel/quirks.c |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e8368c6dd2a2..d5dd80814419 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -86,10 +86,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
 
 void __kernel_fpu_end(void)
 {
-   if (use_eager_fpu())
-   math_state_restore();
-   else
+   if (use_eager_fpu()) {
+   /*
+* For eager fpu, most the time, tsk_used_math() is true.
+* Restore the user math as we are done with the kernel usage.
+* At few instances during thread exit, signal handling etc,
+* tsk_used_math() is false. Those few places will take proper
+* actions, so we don't need to restore the math here.
+*/
+   if (likely(tsk_used_math(current)))
+   math_state_restore();
+   } else {
stts();
+   }
 }
 EXPORT_SYMBOL(__kernel_fpu_end);
 
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7c6acd4b8995..ff898bbf579d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -529,7 +529,7 @@ static void quirk_amd_nb_node(struct pci_dev *dev)
return;
 
   pci_read_config_dword(nb_ht, 0x60, &val);
-   node = val & 7;
+   node = pcibus_to_node(dev->bus) | (val & 7);
/*
 * Some hardware may return an invalid node ID,
 * so check it first:
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 6/9] powerpc/85xx: support sleep feature on QorIQ SoCs with RCPM

2014-03-14 Thread Scott Wood

On Wed, 2014-03-12 at 16:08 +0800, Chenhui Zhao wrote:
> On Tue, Mar 11, 2014 at 07:00:27PM -0500, Scott Wood wrote:
> > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote:
> > > > In sleep mode, the clocks of e500 cores and unused IP blocks are
> > > turned off. The IP blocks which are allowed to wake up the processor
> > > are still running.
> > > 
> > > The sleep mode is equal to the Standby state in Linux. Use the
> > > command to enter sleep mode:
> > >   echo standby > /sys/power/state
> > > 
> > > Signed-off-by: Chenhui Zhao 
> > > ---
> > >  arch/powerpc/Kconfig   |4 +-
> > >  arch/powerpc/platforms/85xx/Makefile   |3 +
> > >  arch/powerpc/platforms/85xx/qoriq_pm.c |   78 
> > > 
> > >  3 files changed, 83 insertions(+), 2 deletions(-)
> > >  create mode 100644 arch/powerpc/platforms/85xx/qoriq_pm.c
> > > 
> > > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> > > index 05f6323..e1d6510 100644
> > > --- a/arch/powerpc/Kconfig
> > > +++ b/arch/powerpc/Kconfig
> > > @@ -222,7 +222,7 @@ config ARCH_HIBERNATION_POSSIBLE
> > >  config ARCH_SUSPEND_POSSIBLE
> > >   def_bool y
> > >   depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx || \
> > > -(PPC_85xx && !PPC_E500MC) || PPC_86xx || PPC_PSERIES \
> > > +FSL_SOC_BOOKE || PPC_86xx || PPC_PSERIES \
> > >  || 44x || 40x
> > >  
> > >  config PPC_DCR_NATIVE
> > > @@ -709,7 +709,7 @@ config FSL_PCI
> > >  config FSL_PMC
> > >   bool
> > >   default y
> > > - depends on SUSPEND && (PPC_85xx || PPC_86xx)
> > > + depends on SUSPEND && (PPC_85xx && !PPC_E500MC || PPC_86xx)
> > 
> > Don't mix && and || without parentheses.
> > 
> > Maybe convert this into being selected (similar to FSL_RCPM), rather
> > than default y?
> 
> Yes, will do.
> 
> > 
> > > diff --git a/arch/powerpc/platforms/85xx/Makefile 
> > > b/arch/powerpc/platforms/85xx/Makefile
> > > index 25cebe7..7fae817 100644
> > > --- a/arch/powerpc/platforms/85xx/Makefile
> > > +++ b/arch/powerpc/platforms/85xx/Makefile
> > > @@ -2,6 +2,9 @@
> > >  # Makefile for the PowerPC 85xx linux kernel.
> > >  #
> > >  obj-$(CONFIG_SMP) += smp.o
> > > +ifeq ($(CONFIG_FSL_CORENET_RCPM), y)
> > > +obj-$(CONFIG_SUSPEND)+= qoriq_pm.o
> > > +endif
> > 
> > There should probably be a kconfig symbol for this.
> 
> OK.
> 
> > 
> > > diff --git a/arch/powerpc/platforms/85xx/qoriq_pm.c 
> > > b/arch/powerpc/platforms/85xx/qoriq_pm.c
> > > new file mode 100644
> > > index 000..915b13b
> > > --- /dev/null
> > > +++ b/arch/powerpc/platforms/85xx/qoriq_pm.c
> > > @@ -0,0 +1,78 @@
> > > +/*
> > > + * Support Power Management feature
> > > + *
> > > + * Copyright 2014 Freescale Semiconductor Inc.
> > > + *
> > > + * Author: Chenhui Zhao 
> > > + *
> > > + * This program is free software; you can redistribute   it and/or 
> > > modify it
> > > + * under  the terms of   the GNU General  Public License as published by 
> > > the
> > > + * Free Software Foundation;  either version 2 of the  License, or (at 
> > > your
> > > + * option) any later version.
> > > + */
> > > +
> > > +#include 
> > > +#include 
> > > +#include 
> > > +
> > > +#include 
> > > +
> > > +#define FSL_SLEEP0x1
> > > +#define FSL_DEEP_SLEEP   0x2
> > 
> > FSL_DEEP_SLEEP is unused.
> 
> Will be used in the last patch.
> [PATCH 9/9] powerpc/pm: support deep sleep feature on T1040

Ideally the #define would have been introduced in that patch.

> > > + sleep_modes = FSL_SLEEP;
> > > + sleep_pm_state = PLAT_PM_SLEEP;
> > > +
> > > + np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-rcpm-2.0");
> > > + if (np)
> > > + sleep_pm_state = PLAT_PM_LPM20;
> > > +
> > > + suspend_set_ops(_suspend_ops);
> > > +
> > > + return 0;
> > > +}
> > > +arch_initcall(qoriq_suspend_init);
> > 
> > Why is this not a platform driver?  If fsl_pmc can do it...
> > 
> > -Scott
> 
> It can be, but what advantage of being a platform driver.

If nothing else, compliance with the standard way of doing things.  Why
not make it a platform driver?  You'd be able to use dev_err, have a
place in sysfs if attributes are needed in the future, etc.

A better answer might be that there are multiple not-very-related files
driving different portions of RCPM.

-Scott


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch v3]DM: dm-insitu-comp: a compressed DM target for SSD

2014-03-14 Thread Mike Snitzer

On Fri, Mar 14 2014 at  5:40am -0400,
Shaohua Li  wrote:

> On Mon, Mar 10, 2014 at 09:52:56AM -0400, Mike Snitzer wrote:
> > On Fri, Mar 07 2014 at  2:57am -0500,
> > Shaohua Li  wrote:
> > 
> > > ping!
> > 
> > Hi,
> > 
> > I intend to get dm-insitu-comp reviewed for 3.15.  Sorry I haven't
> > gotten back with you before now, been busy tending to 3.14-rc issues.
> > 
> > I took a quick first pass over your code a couple weeks ago.  Looks to
> > be in great shape relative to coding conventions and the more DM
> > specific conventions.  Clearly demonstrates you have a good command of
> > DM concepts and quirks.

Think I need to eat my words from above at least partially.  Given you
haven't implemented any of the target suspend or resume hooks this
target will _not_ work properly across suspend + resume cycles that all
DM targets must support.

But we can obviously work through it with urgency for 3.15.

I've pulled your v3 patch into git and have overlayed edits from my
first pass.  Lots of funky wrapping to conform to 80 columns.  But
whitespace aside, I've added FIXME:s in the relevant files.  If you work
on any of these FIXMEs please send follow-up patches so that we don't
step on each others' toes.

Please see the 'for-3.15-insitu-comp' branch of this git repo:
git://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git

https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=for-3.15-insitu-comp

> > But one thing that would really help get dm-insitu-comp into 3.15 is to
> > show that the code is working as you'd expect.  To that end, it'd be
> > great if you'd be willing to add dm-insitu-comp support to the
> > device-mapper-test-suite, see:
> > https://github.com/jthornber/device-mapper-test-suite
> > 
> > I recently added barebones/simple dm-crypt support, see:
> > https://github.com/jthornber/device-mapper-test-suite/commit/c865bcd4e48228e18626d94327fb2485cf9ec9a1
> > 
> > But it may be that activation/test code for the other targets (e.g. thin
> > or cache) are more useful examples to follow for implementing
> > dm-insitu-comp stack activation, see:
> > https://github.com/jthornber/device-mapper-test-suite/blob/master/lib/dmtest/pool-stack.rb
> > https://github.com/jthornber/device-mapper-test-suite/blob/master/lib/dmtest/cache_stack.rb
> > 
> > All said, implementing dm-insitu-comp support for dmts (including some
> > tests that establish it is working as intended) isn't a hard requirement
> > for getting the target upstream but it would _really_ help.
> 
> Ok, I added some simple tests in the test suites.

OK, I missed this before because it was an attachment.  I was confused
as to whether you already added or will add support.  Now that I've
replied to this mail mutt pulled in the attachment ;)

I'll take for a spin on Monday (or over the weekend if I'm bored).

Thanks,
Mike
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] [RFC] perf: Fix a race between ring_buffer_detach() and ring_buffer_wakeup()

2014-03-14 Thread Peter Zijlstra

On Fri, Mar 14, 2014 at 01:47:37PM -0700, Paul E. McKenney wrote:
> This general idea can be made to work, but it will need some
> internal-to-RCU help.  One vulnerability of the patch below is the
> following sequence of steps:
> 
> 1.RCU has just finished a grace period, and is doing the
>   end-of-grace-period accounting.
> 
> 2.The code below invokes rcu_batches_completed().  Let's assume
>   the result returned is 42.
> 
> 3.RCU completes the end-of-grace-period accounting, and increments
>   rcu_sched_state.completed.
> 
> 4.The code below checks ->rcu_batches against the result from
>   another invocation of rcu_batches_completed() and sees that
>   the 43 is not equal to 42, so skips the synchronize_rcu().
> 
> Except that a grace period has not actually completed.  Boom!!!
> 
> The problem is that rcu_batches_completed() is only intended to give
> progress information on RCU.

Ah, I thought I was missing something when I was looking through the rcu
code in a hurry :-)

I knew there'd be some subtlety between completed and gpnum and such :-)

> What I can do is give you a pair of functions, one to take a snapshot of
> the current grace-period state (returning an unsigned long) and another
> to evaluate a previous snapshot, invoking synchronize_rcu() if there has
> not been a full grace period in the meantime.
> 
> The most straightforward approach would invoke acquiring the global
> rcu_state ->lock on each call, which I am guessing just might be
> considered to be excessive overhead.  ;-)  I should be able to decrease
> the overhead to a memory barrier on each call, and perhaps even down
> to an smp_load_acquire().  Accessing the RCU state probably costs you
> a cache miss both times.
> 
> Thoughts?

Sounds fine, the attach isn't a hotpath, so even the locked version
should be fine, but I won't keep you from making it all fancy and such
:-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 01/10] perf, tools: Add jsmn `jasmine' JSON parser

2014-03-14 Thread David Ahern


On 3/14/14, 3:31 PM, Andi Kleen wrote:

@@ -374,6 +376,8 @@ LIB_OBJS += $(OUTPUT)util/stat.o
  LIB_OBJS += $(OUTPUT)util/record.o
  LIB_OBJS += $(OUTPUT)util/srcline.o
  LIB_OBJS += $(OUTPUT)util/data.o
+LIB_OBJS += $(OUTPUT)util/jsmn.o
+LIB_OBJS += $(OUTPUT)util/json.o

  LIB_OBJS += $(OUTPUT)ui/setup.o
  LIB_OBJS += $(OUTPUT)ui/helpline.o



CONFIG driven? Allow a user to omit this.

David

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/9] powerpc/85xx: disable irq by hardware when suspend for 64-bit

2014-03-14 Thread Scott Wood

On Wed, 2014-03-12 at 15:46 +0800, Chenhui Zhao wrote:
> On Tue, Mar 11, 2014 at 06:51:20PM -0500, Scott Wood wrote:
> > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote:
> > > In 64-bit mode, kernel just clears the irq soft-enable flag
> > > in struct paca_struct to disable external irqs. But, in
> > > the case of suspend, irqs should be disabled by hardware.
> > > Therefore, hook a function to ppc_md.suspend_disable_irqs
> > > to really disable irqs.
> > > 
> > > Signed-off-by: Chenhui Zhao 
> > > ---
> > >  arch/powerpc/platforms/85xx/corenet_generic.c |   12 
> > >  1 files changed, 12 insertions(+), 0 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c 
> > > b/arch/powerpc/platforms/85xx/corenet_generic.c
> > > index 3fdf9f3..983d81f 100644
> > > --- a/arch/powerpc/platforms/85xx/corenet_generic.c
> > > +++ b/arch/powerpc/platforms/85xx/corenet_generic.c
> > > @@ -32,6 +32,13 @@
> > >  #include 
> > >  #include "smp.h"
> > >  
> > > +#if defined(CONFIG_PPC64) && defined(CONFIG_SUSPEND)
> > > +static void fsl_suspend_disable_irqs(void)
> > > +{
> > > + __hard_irq_disable();
> > > +}
> > > +#endif
> > 
> > Why the underscore version?  Don't you want PACA_IRQ_HARD_DIS to be set?
> > 
> > If hard disabling is appropriate here, shouldn't we do it in
> > generic_suspend_disable_irqs()?
> > 
> > Are there any existing platforms that supply a
> > ppc_md.suspend_disable_irqs()?  I don't see any when grepping.
> > 
> > -Scott
> 
> Will use hard_irq_disable().
> 
> I think this is a general problem for powerpc.
> Should clear MSR_EE before suspend. I agree to put it
> in generic_suspend_disable_irqs().

BTW, make sure you test this patchset with CONFIG_DEBUG_PREEMPT and
similar debugging options to help ensure that the soft IRQ state is
being tracked properly.

-Scott


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/9] powerpc/rcpm: add RCPM driver

2014-03-14 Thread Scott Wood

On Wed, 2014-03-12 at 11:59 +0800, Chenhui Zhao wrote:
> On Tue, Mar 11, 2014 at 06:42:51PM -0500, Scott Wood wrote:
> > On Fri, 2014-03-07 at 12:57 +0800, Chenhui Zhao wrote:
> > > +int fsl_rcpm_init(void)
> > > +{
> > > + struct device_node *np;
> > > +
> > > + np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-rcpm-2.0");
> > > + if (np) {
> > > + rcpm_v2_regs = of_iomap(np, 0);
> > > + of_node_put(np);
> > > + if (!rcpm_v2_regs)
> > > + return -ENOMEM;
> > > +
> > > + qoriq_pm_ops = _rcpm_v2_ops;
> > > +
> > > + } else {
> > > + np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-rcpm-1.0");
> > > + if (np) {
> > > + rcpm_v1_regs = of_iomap(np, 0);
> > > + of_node_put(np);
> > > + if (!rcpm_v1_regs)
> > > + return -ENOMEM;
> > > +
> > > + qoriq_pm_ops = _rcpm_v1_ops;
> > > +
> > > + } else {
> > > + pr_err("%s: can't find the rcpm node.\n", __func__);
> > > + return -EINVAL;
> > > + }
> > > + }
> > > +
> > > + return 0;
> > > +}
> > 
> > Why isn't this a proper platform driver?
> > 
> > -Scott
> 
> The RCPM is not a single function IP block, instead it is a collection
> of device run control and power management. It would be called by other
> drivers and functions. For example, the callback .freeze_time_base()
> need to be called at early stage of kernel init. Therefore, it would be
> better to init it at early stage.

OK, but consider using of_find_matching_node_and_match().

-Scott


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Trusted kernel patchset for Secure Boot lockdown

2014-03-14 Thread One Thousand Gnomes

On Fri, 14 Mar 2014 22:15:45 +
Matthew Garrett  wrote:

> On Fri, 2014-03-14 at 22:08 +, One Thousand Gnomes wrote:
> > On Fri, 14 Mar 2014 21:56:33 +
> > Matthew Garrett  wrote:
> > > Signed userspace is not a requirement, and therefore any solution that
> > > relies on a signed initrd is inadequate. There are use cases that
> > > require verification of the initrd and other levels. This isn't one of
> > > them.
> > 
> > The job of the kernel is to solve the general problem. There are lots of
> > people who happen to care about verification beyond the kernel so it
> > shouldn't be ignored. And they can do things like load trusted SELinux
> > rulesets even if you can't support it in your environment.
> 
> The general problem includes having to support this even without an
> selinux policy.

Yes. No dispute about that. But equally the general solution should allow
for it.

> And one that's not going to change, so the general problem includes not
> relying on a signed initramfs.

Likewise

> some other way. ChromeOS will load unmeasured kernel modules provided it
> can attest to the trustworthiness of the filesystem containing them.

See "How to Bypass Verified Boot Security in Chromium OS" 8)

And it attests the trustworthiness of the filesystem by measuring it. If
you have a measurement of object X that states it is unchanged then you
have a valid measurement of any subset of object X for which the same
assertion is proven. In this case since you know all the bits in the root
fs are as before, so you know all the bits in the module are as before

And how do you know all the bits in the root fs are as before, because you
have a set of measurements (hashes) on partition 12. At the end of the
day you end up with a chain of measurements from a trusted thing you deep
immutable. If your chain has gaps you have holes (see above).

So ChromeOS loads *measured* kernel modules. It just did the measuring
differently to the signed module code.

Alan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 9/9] powerpc/pm: support deep sleep feature on T1040

2014-03-14 Thread Scott Wood

On Thu, 2014-03-13 at 15:46 +0800, Kevin Hao wrote:
> On Wed, Mar 12, 2014 at 12:43:05PM -0500, Scott Wood wrote:
> > > > Shouldn't we use "readback, sync" here? The following is quoted from 
> > > t4240RM:
> > >   To guarantee that the results of any sequence of writes to configuration
> > >   registers are in effect, the final configuration register write should 
> > > be
> > >   immediately followed by a read of the same register, and that should be
> > >   followed by a SYNC instruction. Then accesses can safely be made to 
> > > memory
> > >   regions affected by the configuration register write.
> > 
> > I agree that the sync before the readback is probably not necessary,
> > since transactions to the same address should already be ordered.
> > 
> > A sync after the readback helps if you're trying to order the readback
> > with subsequent memory accesses, though in that case wouldn't a sync
> > alone (no readback) be adequate?
> 
> No, we don't just want to order the subsequent memory access here.
> The 'write, readback, sync' is the required sequence if we want to make
> sure that the writing to CCSR register does really take effect.
> 
> >  Though maybe not always -- see the
> > comment near the end of fsl_elbc_write_buf() in
> > drivers/mtd/nand_fsl_elbc.c.  I guess the readback does more than just
> > make sure the device has seen the write, ensuring that the device has
> > finished the transaction to the point of acting on another one.
> 
> Agree.
> 
> > 
> > The data dependency plus isync sequence, which is done by the normal I/O
> > accessors used from C code, orders the readback versus all future
> > instructions (not just I/O).  The delay loop is not I/O.
> 
> According to the PowerISA, the sequence 'load, data dependency, isync' only
> orders the load accesses. 

The point is to order the delay loop after the load, not to order
storage versus storage.

This is a sequence we're already using on all of our I/O loads
(excluding accesses like in this patch that don't use the standard
accessors).  I'm confident that it works even if it's not
architecturally guaranteed.  I'm not sure that there exists a clear
architectural way of synchronizing non-storage instructions relative to
storage instructions.

Given that isync is documented as preventing any execution of
instructions after the isync until all previous instructions complete,
it doesn't seem to make sense for the architecture to explicitly talk
about loads (as opposed to any other instruction) following a load,
dependent conditional branch, isync sequence.

> So if we want to order all the storage access as well
> as execution synchronization, we should choose sync here.

Do we need execution synchronization or context synchronization?

The t4240 RM section that talks about a readback and a sync is in the
context of subsequent memory operations ("Then accesses can safely be
made to memory regions affected..."), not arbitrary instructions.  There
are also a couple other places in the RM where isync is recommended
instead (when setting LAWs or CCSRBAR), even though those also only
involve memory accesses.

In any case, this is not performance critical and thus it's better to
oversynchronize than undersynchronize.

-Scott

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [ovs-dev] [PATCH] openvswitch: Orphan frags before sending to userspace via Netlink to avoid guest stall

2014-03-14 Thread Zoltan Kiss


On 11/03/14 19:41, Zoltan Kiss wrote:

On 07/03/14 17:59, Thomas Graf wrote:

On 03/07/2014 06:28 PM, Pravin Shelar wrote:

Problem is mapping SKBTX_DEV_ZEROCOPY pages to userspace. skb_zerocopy
is not doing that.

Unless I'm missing something, the current netlink code cannot handle
skb-frags with zero copy. So we have to copy the skb anyway, and there is
no need to orphan frags here.
If you are planning on handling skb-frags without copying then
skb_orphan_frags should be done in netlink.


If you look at the second part of skb_zerocopy() this is exactly what
it is doing unless the target skb has sufficient linear space
preallocated. At least unless mmap is enabled in which case we would
have to copy again until we have implemented a way to pass page refs
via the nl ring buffer.

So I think Zoltan is correct in orphaning frags that come from f.e.
a tun device via zerocopy_sg_from_iovec().


Now as I'm checking how Netlink works, I might be wrong at some parts :)
skb_zerocopy correctly adds the frags to the user_skb we are sending
upwards; however, when userspace receives it in netlink_recvmsg(), it
gets copied to the supplied buffer anyway. Is that correct? In which
case we don't need to worry that userspace will sit on that page
indefinitely. However we have to worry about userspace not calling recv
on that Netlink socket, so in the end we still need skb_orphan_frags,
just for a different reason :)
We can put skb_orphan_frags into skb_zerocopy; skb_clone also does that.

However with Netlink mmapped IO, we should take a different approach,
and instead of calling skb_orphan_frags we should make sure user_skb can
hold any skb we get from the kernel, and copy the frags there. Even if
we would be able to pass page refs to userspace through the ring buffer
(AFAIK currently we can't), it would be fragile to just pass kernel
pages directly to userspace, even if they came without the
SKBTX_DEV_ZEROCOPY flag. And I think it would be quite rare that we need
that copy anyway, because the flow setup usually happens with small
packets without frags.
If we choose the above approach with Netlink mmap, we don't need
skb_orphan_frags, in fact


I spent some time thinking about this mmapped scenario, and discussed it 
with others: the conclusion is that exposing local kernel pages through 
the frags array shouldn't be as big a problem as I thought before. 
So OVS can get along with passing refs to those pages in the shared 
ring. However skb_orphan_frags would still be necessary in skb_zerocopy, 
for the same reason as now.
Should I post a new patch which calls skb_orphan_frags in skb_zerocopy? Or 
do you have any other opinion?


Zoli

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/4] sched/rt: Sum number of all children tasks in hierarhy at rt_nr_running

2014-03-14 Thread Kirill Tkhai

{inc,dec}_rt_tasks used to count entities which are directly queued
on rt_rq. If an entity was not a task (i.e., it is some queue), its
children were not counted.

There is no problem here, but now we want to count the number of all tasks
which are actually queued under the rt_rq in the whole hierarchy (except
throttled rt queues).

Empty queues are not able to be queued and all of the places, which
use rt_nr_running, just compare it with zero, so we do not break
anything here.

Signed-off-by: Kirill Tkhai 
CC: Peter Zijlstra 
CC: Ingo Molnar 
---
 kernel/sched/rt.c |   15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf16..e4def13 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1045,12 +1045,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct 
rt_rq *rt_rq) {}
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+   struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+   if (group_rq)
+   return group_rq->rt_nr_running;
+   else
+   return 1;
+}
+
+static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
int prio = rt_se_prio(rt_se);
 
WARN_ON(!rt_prio(prio));
-   rt_rq->rt_nr_running++;
+   rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1073,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct 
rt_rq *rt_rq)
 {
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
-   rt_rq->rt_nr_running--;
+   rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/4] sched: Revert commit 4c6c4e38c4e9

2014-03-14 Thread Kirill Tkhai

This reverts commit 4c6c4e38c4e9 [sched/core: Fix endless loop in
pick_next_task()], which is not necessary after [sched/rt: Substract number
of tasks of throttled queues from rq->nr_running]

Signed-off-by: Kirill Tkhai 
CC: Peter Zijlstra 
CC: Ingo Molnar 
---
 kernel/sched/fair.c  |4 +---
 kernel/sched/rt.c|   10 ++
 kernel/sched/sched.h |   12 
 3 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e9bd0b..0d39ef7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6726,9 +6726,7 @@ static int idle_balance(struct rq *this_rq)
 
 out:
/* Is there a task of a high priority class? */
-   if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-   (this_rq->dl.dl_nr_running ||
-(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+   if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
if (pulled_task) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c961350..ec0933e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -493,6 +493,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+   return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
 static int rt_se_boosted(struct sched_rt_entity *rt_se)
 {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -569,6 +574,11 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
dequeue_top_rt_rq(rt_rq);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+   return rt_rq->rt_throttled;
+}
+
 static inline const struct cpumask *sched_rt_period_mask(void)
 {
return cpu_online_mask;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8327b4e..e8493b4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -425,18 +425,6 @@ struct rt_rq {
 #endif
 };
 
-#ifdef CONFIG_RT_GROUP_SCHED
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-   return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-#else
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-   return rt_rq->rt_throttled;
-}
-#endif
-
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/4] sched/rt: Substract number of tasks of throttled queues from rq->nr_running

2014-03-14 Thread Kirill Tkhai

Now rq->rt can be in either a dequeued or an enqueued state.
We add new member rt_rq->rt_queued, which is used to indicate this.
The member is used only for top queue rq->rt_rq.

The goal is to fit the generic scheme which is used in the deadline and
fair classes, i.e. a throttled rt_rq's rt_nr_running is being
subtracted from rq->nr_running.

Signed-off-by: Kirill Tkhai 
CC: Peter Zijlstra 
CC: Ingo Molnar 
---
 kernel/sched/rt.c|   73 ++
 kernel/sched/sched.h |2 +
 2 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 93810d2..c961350 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
 #endif
+   /* We start is dequeued state, because no RT tasks are queued */
+   rt_rq->rt_queued = 0;
 
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -404,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
return !list_empty(&rt_se->run_list);
@@ -465,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
 
if (rt_rq->rt_nr_running) {
-   if (rt_se && !on_rt_rq(rt_se))
+   if (!rt_se)
+   enqueue_top_rt_rq(rt_rq);
+   else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
+
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
@@ -479,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
rt_se = rt_rq->tg->rt_se[cpu];
 
-   if (rt_se && on_rt_rq(rt_se))
+   if (!rt_se)
+   dequeue_top_rt_rq(rt_rq);
+   else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
 }
 
@@ -545,12 +555,18 @@ static inline struct rt_rq *group_rt_rq(struct 
sched_rt_entity *rt_se)
 
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
-   if (rt_rq->rt_nr_running)
-   resched_task(rq_of_rt_rq(rt_rq)->curr);
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+
+   if (!rt_rq->rt_nr_running)
+   return;
+
+   enqueue_top_rt_rq(rt_rq);
+   resched_task(rq->curr);
 }
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
+   dequeue_top_rt_rq(rt_rq);
 }
 
 static inline const struct cpumask *sched_rt_period_mask(void)
@@ -935,6 +951,38 @@ static void update_curr_rt(struct rq *rq)
}
 }
 
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+
+   BUG_ON(&rq->rt != rt_rq);
+
+   if (!rt_rq->rt_queued)
+   return;
+
+   BUG_ON(!rq->nr_running);
+
+   rq->nr_running -= rt_rq->rt_nr_running;
+   rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+
+   BUG_ON(&rq->rt != rt_rq);
+
+   if (rt_rq->rt_queued)
+   return;
+   if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+   return;
+
+   rq->nr_running += rt_rq->rt_nr_running;
+   rt_rq->rt_queued = 1;
+}
+
 #if defined CONFIG_SMP
 
 static void
@@ -1143,6 +1191,8 @@ static void dequeue_rt_stack(struct sched_rt_entity 
*rt_se)
back = rt_se;
}
 
+   dequeue_top_rt_rq(rt_rq_of_se(back));
+
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1151,13 +1201,18 @@ static void dequeue_rt_stack(struct sched_rt_entity 
*rt_se)
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
+   struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
+   enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
+   struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
 
for_each_sched_rt_entity(rt_se) {
@@ -1166,6 +1221,7 @@ static void dequeue_rt_entity(struct sched_rt_entity 
*rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
+   enqueue_top_rt_rq(&rq->rt);
 }
 
 /*
@@ -1183,8 +1239,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int 
flags)
 
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
-
-   inc_nr_running(rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1195,8 +1249,6 @@ static void dequeue_task_rt(struct

[PATCH 2/4] sched/rt: Add accessors rq_of_rt_se()

2014-03-14 Thread Kirill Tkhai

Two accessors for RT_GROUP_SCHED and !RT_GROUP_SCHED cases.

Signed-off-by: Kirill Tkhai 
CC: Peter Zijlstra 
CC: Ingo Molnar 
---
 kernel/sched/rt.c |   17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e4def13..93810d2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -112,6 +112,13 @@ static inline struct rt_rq *rt_rq_of_se(struct 
sched_rt_entity *rt_se)
return rt_se->rt_rq;
 }
 
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+   struct rt_rq *rt_rq = rt_se->rt_rq;
+
+   return rt_rq->rq;
+}
+
 void free_rt_sched_group(struct task_group *tg)
 {
int i;
@@ -211,10 +218,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
 }
 
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 {
struct task_struct *p = rt_task_of(rt_se);
-   struct rq *rq = task_rq(p);
+
+   return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+   struct rq *rq = rq_of_rt_se(rt_se);
 
return &rq->rt;
 }




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1216 matches

Mail list logo