date:20130605

[PATCH] perf: Update event buffer tail when overwriting old events

2013-06-05 Thread Yan, Zheng

From: "Yan, Zheng" 

If a perf event buffer is in overwrite mode, the kernel only updates
the data head when it overwrites old samples. The program that owns
the buffer needs to periodically check the buffer and update a variable
that tracks the data tail. If the program fails to do this in time,
the data tail can be overwritten by new samples. The program then has
to rewind the buffer because it does not know where the first valid
sample is.

This patch makes the kernel update the data tail when it overwrites
old events, so the program that owns the event buffer can always
read the latest samples. This is convenient for programs that use
perf to do branch tracing. One use case is GDB branch tracing:
(http://sourceware.org/ml/gdb-patches/2012-06/msg00172.html)
It uses the perf interface to read BTS, but only cares about the
branches before the ptrace event.

I added code to perf_output_{begin/end} to count how many cycles
are spent by sample output, then ran "perf record" to profile kernel
compilation 10 times on IvyBridge-EP. (perf record -a make -j 60)
The first number is scaled to 1000; the remaining numbers are scaled by
the same factor.

        before   overwrite mode   after   overwrite mode
AVG     1000     999              1046    1044
STDEV   19.4     19.5             17.1    17.9

Signed-off-by: Yan, Zheng 
---
 kernel/events/internal.h|  2 ++
 kernel/events/ring_buffer.c | 74 -
 2 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4..c6d7539 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -20,6 +20,8 @@ struct ring_buffer {
 
atomic_tpoll;   /* POLL_ for wakeups */
 
+   local_t tail;   /* read position */
+   local_t next_tail;  /* next read position */
local_t head;   /* write position*/
local_t nest;   /* nested writers*/
local_t events; /* event limit   */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144..2d5b15e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -15,28 +15,9 @@
 
 #include "internal.h"
 
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
- unsigned long offset, unsigned long head)
+static bool perf_output_space(unsigned long tail, unsigned long offset,
+ unsigned long head, unsigned long mask)
 {
-   unsigned long sz = perf_data_size(rb);
-   unsigned long mask = sz - 1;
-
-   /*
-* check if user-writable
-* overwrite : over-write its own tail
-* !overwrite: buffer possibly drops events.
-*/
-   if (rb->overwrite)
-   return true;
-
-   /*
-* verify that payload is not bigger than buffer
-* otherwise masking logic may fail to detect
-* the "not enough space" condition
-*/
-   if ((head - offset) > sz)
-   return false;
-
offset = (offset - tail) & mask;
head   = (head   - tail) & mask;
 
@@ -113,7 +94,7 @@ int perf_output_begin(struct perf_output_handle *handle,
  struct perf_event *event, unsigned int size)
 {
struct ring_buffer *rb;
-   unsigned long tail, offset, head;
+   unsigned long tail, offset, head, max_size;
int have_lost;
struct perf_sample_data sample_data;
struct {
@@ -136,7 +117,8 @@ int perf_output_begin(struct perf_output_handle *handle,
handle->rb  = rb;
handle->event   = event;
 
-   if (!rb->nr_pages)
+   max_size = perf_data_size(rb);
+   if (size > max_size)
goto out;
 
have_lost = local_read(>lost);
@@ -149,19 +131,43 @@ int perf_output_begin(struct perf_output_handle *handle,
 
perf_output_get_handle(handle);
 
-   do {
+   if (rb->overwrite) {
+   do {
+   tail = local_read(>tail);
+   offset = local_read(>head);
+   head = offset + size;
+   if (unlikely(!perf_output_space(tail, offset, head,
+   max_size - 1))) {
+   tail = local_read(>next_tail);
+   local_set(>tail, tail);
+   rb->user_page->data_tail = tail;
+   }
+   } while (local_cmpxchg(>head, offset, head) != offset);
+
/*
-* Userspace could choose to issue a mb() before updating the
-* tail pointer. So that all reads will be completed before the
-* write is issued.
+* Save the start

Re: [PATCH] net: Unbreak compat_sys_{send,recv}msg

2013-06-05 Thread Michael Neuling

Andy Lutomirski  wrote:

> I broke them in this commit:
> 
> commit 1be374a0518a288147c6a7398792583200a67261
> Author: Andy Lutomirski 
> Date:   Wed May 22 14:07:44 2013 -0700
> 
> net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg
> 
> This patch adds __sys_sendmsg and __sys_recvmsg as common helpers that accept
> MSG_CMSG_COMPAT and blocks MSG_CMSG_COMPAT at the syscall entrypoints.  It
> also reverts some unnecessary checks in sys_socketcall.
> 
> Apparently I was suffering from underscore blindness the first time around.

FWIW This fixes the problem I was seeing with powerpc 32bit user on 64
bit kernel.

Mikey

> 
> Signed-off-by: Andy Lutomirski 
> ---
> 
> I've tested this a little, but I'm not sure I have a great test case.
> 
> If the decision is that it's better to leave this for the 3.11, I can send
> a squashed version.  Note that the oops that this fixes is only an oops if
> the other patches in the original series are applied.
> 
> (FWIW, I wasn't sure how to submit this stuff in the first place.  I submitted
> some kernel hardening patches for the x86 tree that converted an access_ok
> oddity in the net code into an actual oops.  In a bit of looking, I couldn't
> find any failure mode other than a -EFAULT return without the other patches
> applied.  This was clear in the patch series description but not in the
> change log message for the net part.)
> 
>  include/linux/socket.h |  3 +++
>  net/compat.c   | 13 +++--
>  net/socket.c   | 72 
> +++---
>  3 files changed, 47 insertions(+), 41 deletions(-)
> 
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 2b9f74b..e897bdc 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -321,6 +321,9 @@ extern int put_cmsg(struct msghdr*, int level, int type, 
> int len, void *data);
>  
>  struct timespec;
>  
> +/* The __sys_...msg variants allow MSG_CMSG_COMPAT */
> +extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
> +extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
>  extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int 
> vlen,
> unsigned int flags, struct timespec *timeout);
>  extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
> diff --git a/net/compat.c b/net/compat.c
> index 79ae884..f0a1ba6 100644
> --- a/net/compat.c
> +++ b/net/compat.c
> @@ -734,19 +734,25 @@ static unsigned char nas[21] = {
>  
>  asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, 
> unsigned int flags)
>  {
> - return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | 
> MSG_CMSG_COMPAT);
> + if (flags & MSG_CMSG_COMPAT)
> + return -EINVAL;
> + return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | 
> MSG_CMSG_COMPAT);
>  }
>  
>  asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user 
> *mmsg,
>   unsigned int vlen, unsigned int flags)
>  {
> + if (flags & MSG_CMSG_COMPAT)
> + return -EINVAL;
>   return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
> flags | MSG_CMSG_COMPAT);
>  }
>  
>  asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, 
> unsigned int flags)
>  {
> - return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | 
> MSG_CMSG_COMPAT);
> + if (flags & MSG_CMSG_COMPAT)
> + return -EINVAL;
> + return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | 
> MSG_CMSG_COMPAT);
>  }
>  
>  asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, 
> unsigned int flags)
> @@ -768,6 +774,9 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct 
> compat_mmsghdr __user *mmsg,
>   int datagrams;
>   struct timespec ktspec;
>  
> + if (flags & MSG_CMSG_COMPAT)
> + return -EINVAL;
> +
>   if (COMPAT_USE_64BIT_TIME)
>   return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
> flags | MSG_CMSG_COMPAT,
> diff --git a/net/socket.c b/net/socket.c
> index 0e16888..e216502 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -1978,7 +1978,7 @@ struct used_address {
>   unsigned int name_len;
>  };
>  
> -static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
> +static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
>struct msghdr *msg_sys, unsigned int flags,
>struct used_address *used_address)
>  {
> @@ -2093,26 +2093,30 @@ out:
>   *   BSD sendmsg interface
>   */
>  
> -SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, 
> flags)
> +long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
>  {
>   int fput_needed, err;
>   struct msghdr msg_sys;
>   struct socket *sock;
>  
> - if

Re: [PATCH] powerpc/pci: Improve device hotplug initialization

2013-06-05 Thread Benjamin Herrenschmidt

On Wed, 2013-06-05 at 22:25 -0700, Guenter Roeck wrote:
> 
> 
> Can you point me to some of the breaking code ? I guess it must be in some of
> the pci_dma_dev_setup callbacks, but those I looked at only check devicetree
> data or simply set function pointers, both of which should not be affected by
> the call order.
> 
> How about pcibios_fixup_device, to be called after pcibios_fixup_bus ?

Mostly the pseries ones from a cursory glance. They could be improved to
be agnostic to the call order I suppose or simply done better but heh...

I like using pcibios_add though ... we just need to figure out how to
sort out that ordering if possible.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] ARM: dts: bcm281xx: use #include for device tree files

2013-06-05 Thread Matt Porter

Replace /include/ by #include for bcm281xx device tree
files, enabling use of the C preprocessor.

Signed-off-by: Matt Porter 
---
 arch/arm/boot/dts/bcm11351-brt.dts |2 +-
 arch/arm/boot/dts/bcm11351.dtsi|2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/bcm11351-brt.dts 
b/arch/arm/boot/dts/bcm11351-brt.dts
index 248067c..3392f64 100644
--- a/arch/arm/boot/dts/bcm11351-brt.dts
+++ b/arch/arm/boot/dts/bcm11351-brt.dts
@@ -13,7 +13,7 @@
 
 /dts-v1/;
 
-/include/ "bcm11351.dtsi"
+#include "bcm11351.dtsi"
 
 / {
model = "BCM11351 BRT board";
diff --git a/arch/arm/boot/dts/bcm11351.dtsi b/arch/arm/boot/dts/bcm11351.dtsi
index 41b2c6c..c08810e 100644
--- a/arch/arm/boot/dts/bcm11351.dtsi
+++ b/arch/arm/boot/dts/bcm11351.dtsi
@@ -11,7 +11,7 @@
  * GNU General Public License for more details.
  */
 
-/include/ "skeleton.dtsi"
+#include "skeleton.dtsi"
 
 / {
model = "BCM11351 SoC";
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Modify UEFI anti-bricking code

2013-06-05 Thread joeyli

於 四，2013-06-06 於 13:05 +0800，joeyli 提到：
> 於 三，2013-06-05 於 16:59 +0100，Matt Fleming 提到：
> > On Wed, 05 Jun, at 02:53:27PM, Matthew Garrett wrote:
> > > On Wed, 2013-06-05 at 15:49 +0100, Fleming, Matt wrote:
> > > 
> > > > Folks, what do you want me to do with this? Merge it with Matthew's 
> > > > patch?
> > > 
> > > Do that and add Joey's signed-off-by?
> > 
> > Right, this is what I've got queued up.
> > 
> > ---
> > 
> > >From 380dcc12ba82f4e10feb6a72207b2e4771d16d8d Mon Sep 17 00:00:00 2001
> > From: Matthew Garrett 
> > Date: Sat, 1 Jun 2013 16:06:20 -0400
> > Subject: [PATCH] Modify UEFI anti-bricking code
> > 
> > This patch reworks the UEFI anti-bricking code, including an effective
> > reversion of cc5a080c and 31ff2f20. It turns out that calling
> > QueryVariableInfo() from boot services results in some firmware
> > implementations jumping to physical addresses even after entering virtual
> > mode, so until we have 1:1 mappings for UEFI runtime space this isn't
> > going to work so well.
> [...]
> 
> The following diff is based on the 380dcc12 patch queued in the efi git tree;
> it includes Matthew's and hpa's suggestions. I fixed the attributes of the DUMMY
> object to NV/BS/RT and introduced a #define for the minimum reserved flash
> space.
> 
> This change works for me on OVMF.
> 
> 
> 
> Thanks a lot!
> Joey Lee
> 

Sorry for attaching the wrong diff; it was missing the NV/BS/RT attribute
change in efi_query_variable_store(). The correct diff follows.


Thanks a lot!
Joey Lee


diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cc3cfe8..ec8ac97 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -53,6 +53,8 @@
 
 #define EFI_DEBUG  1
 
+#define EFI_MIN_RESERVE 5120
+
 #define EFI_DUMMY_GUID \
EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 
0x9f, 0x92, 0xa9)
 
@@ -988,7 +990,11 @@ void __init efi_enter_virtual_mode(void)
kfree(new_memmap);
 
/* clean DUMMY object */
-   efi.set_variable(efi_dummy_name, _DUMMY_GUID, 0, 0, NULL);
+   efi.set_variable(efi_dummy_name, _DUMMY_GUID,
+EFI_VARIABLE_NON_VOLATILE |
+EFI_VARIABLE_BOOTSERVICE_ACCESS |
+EFI_VARIABLE_RUNTIME_ACCESS,
+0, NULL);
 }
 
 /*
@@ -1051,7 +1057,12 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
 * write if permitting it would reduce the available space to under
 * 5KB. This figure was provided by Samsung, so should be safe.
 */
-   if ((remaining_size - size < 5120) && !efi_no_storage_paranoia) {
+   if ((remaining_size - size < EFI_MIN_RESERVE) &&
+   !efi_no_storage_paranoia) {
+
+   if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+   return EFI_OUT_OF_RESOURCES;
+
/*
 * Triggering garbage collection may require that the firmware
 * generate a real EFI_OUT_OF_RESOURCES error. We can force
@@ -1061,7 +1072,10 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
void *dummy = kmalloc(dummy_size, GFP_ATOMIC);
 
status = efi.set_variable(efi_dummy_name, _DUMMY_GUID,
- attributes, dummy_size, dummy);
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ dummy_size, dummy);
 
if (status == EFI_SUCCESS) {
/*
@@ -1069,7 +1083,10 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
 * that we delete it...
 */
efi.set_variable(efi_dummy_name, _DUMMY_GUID,
-attributes, 0, dummy);
+EFI_VARIABLE_NON_VOLATILE |
+EFI_VARIABLE_BOOTSERVICE_ACCESS |
+EFI_VARIABLE_RUNTIME_ACCESS,
+0, dummy);
}
 
/*
@@ -1085,7 +1102,7 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
/*
 * There still isn't enough room, so return an error
 */
-   if (remaining_size - size < 5120)
+   if (remaining_size - size < EFI_MIN_RESERVE)
return EFI_OUT_OF_RESOURCES;
}
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Modify UEFI anti-bricking code

2013-06-05 Thread Matthew Garrett

On Thu, 2013-06-06 at 13:05 +0800, joeyli wrote:

> + if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
> + return EFI_OUT_OF_RESOURCES;

I'd move this up to the top of the function, and just return 0 - there's
no risk of the firmware causing problems if it's a volatile variable, so
we should probably just pass it down to the firmware and return an error
from there.

-- 
Matthew Garrett | mj...@srcf.ucam.org

[PATCH 0/2] ARM dts: bcm281xx: use preprocessor for device trees

2013-06-05 Thread Matt Porter

This series follows the same approach as taken on Tegra and
OMAP DT files to use the C preprocessor in order to improve
readability. Since bcm281xx does not yet have gpio and pinctrl
support, this series just enables the C preprocessor and removes
the magic constants from existing irq properties. Any new
additions can come in using human readable definitions.

The resulting dtb was diff-tested to validate it versus the non
preprocessed dts version.

Matt Porter (2):
  ARM: dts: bcm281xx: use #include for device tree files
  ARM: dts: bcm281xx: use existing defines for irqs

 arch/arm/boot/dts/bcm11351-brt.dts |2 +-
 arch/arm/boot/dts/bcm11351.dtsi|9 ++---
 2 files changed, 7 insertions(+), 4 deletions(-)

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] ARM: dts: bcm281xx: use existing defines for irqs

2013-06-05 Thread Matt Porter

Use the standard interrupt-controller and ARM GIC constants to
improve the readability of bcm281xx DT irq properties.

Signed-off-by: Matt Porter 
---
 arch/arm/boot/dts/bcm11351.dtsi |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/bcm11351.dtsi b/arch/arm/boot/dts/bcm11351.dtsi
index c08810e..824bebe 100644
--- a/arch/arm/boot/dts/bcm11351.dtsi
+++ b/arch/arm/boot/dts/bcm11351.dtsi
@@ -11,6 +11,9 @@
  * GNU General Public License for more details.
  */
 
+#include 
+#include 
+
 #include "skeleton.dtsi"
 
 / {
@@ -41,7 +44,7 @@
status = "disabled";
reg = <0x3e00 0x1000>;
clock-frequency = <1300>;
-   interrupts = <0x0 67 0x4>;
+   interrupts = ;
reg-shift = <2>;
reg-io-width = <4>;
};
@@ -56,7 +59,7 @@
timer@35006000 {
compatible = "bcm,kona-timer";
reg = <0x35006000 0x1000>;
-   interrupts = <0x0 7 0x4>;
+   interrupts = ;
clock-frequency = <32768>;
};
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] net: Unbreak compat_sys_{send,recv}msg

2013-06-05 Thread Andy Lutomirski

I broke them in this commit:

commit 1be374a0518a288147c6a7398792583200a67261
Author: Andy Lutomirski 
Date:   Wed May 22 14:07:44 2013 -0700

net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

This patch adds __sys_sendmsg and __sys_recvmsg as common helpers that accept
MSG_CMSG_COMPAT and blocks MSG_CMSG_COMPAT at the syscall entrypoints.  It
also reverts some unnecessary checks in sys_socketcall.

Apparently I was suffering from underscore blindness the first time around.

Signed-off-by: Andy Lutomirski 
---

I've tested this a little, but I'm not sure I have a great test case.

If the decision is that it's better to leave this for the 3.11, I can send
a squashed version.  Note that the oops that this fixes is only an oops if
the other patches in the original series are applied.

(FWIW, I wasn't sure how to submit this stuff in the first place.  I submitted
some kernel hardening patches for the x86 tree that converted an access_ok
oddity in the net code into an actual oops.  In a bit of looking, I couldn't
find any failure mode other than a -EFAULT return without the other patches
applied.  This was clear in the patch series description but not in the
change log message for the net part.)

 include/linux/socket.h |  3 +++
 net/compat.c   | 13 +++--
 net/socket.c   | 72 +++---
 3 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 2b9f74b..e897bdc 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -321,6 +321,9 @@ extern int put_cmsg(struct msghdr*, int level, int type, 
int len, void *data);
 
 struct timespec;
 
+/* The __sys_...msg variants allow MSG_CMSG_COMPAT */
+extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
 extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int 
vlen,
  unsigned int flags, struct timespec *timeout);
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
diff --git a/net/compat.c b/net/compat.c
index 79ae884..f0a1ba6 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -734,19 +734,25 @@ static unsigned char nas[21] = {
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, 
unsigned int flags)
 {
-   return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | 
MSG_CMSG_COMPAT);
+   if (flags & MSG_CMSG_COMPAT)
+   return -EINVAL;
+   return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | 
MSG_CMSG_COMPAT);
 }
 
 asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
unsigned int vlen, unsigned int flags)
 {
+   if (flags & MSG_CMSG_COMPAT)
+   return -EINVAL;
return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
  flags | MSG_CMSG_COMPAT);
 }
 
 asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, 
unsigned int flags)
 {
-   return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | 
MSG_CMSG_COMPAT);
+   if (flags & MSG_CMSG_COMPAT)
+   return -EINVAL;
+   return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | 
MSG_CMSG_COMPAT);
 }
 
 asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned 
int flags)
@@ -768,6 +774,9 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct 
compat_mmsghdr __user *mmsg,
int datagrams;
struct timespec ktspec;
 
+   if (flags & MSG_CMSG_COMPAT)
+   return -EINVAL;
+
if (COMPAT_USE_64BIT_TIME)
return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
  flags | MSG_CMSG_COMPAT,
diff --git a/net/socket.c b/net/socket.c
index 0e16888..e216502 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1978,7 +1978,7 @@ struct used_address {
unsigned int name_len;
 };
 
-static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
+static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
 struct msghdr *msg_sys, unsigned int flags,
 struct used_address *used_address)
 {
@@ -2093,26 +2093,30 @@ out:
  * BSD sendmsg interface
  */
 
-SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, 
flags)
+long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
 {
int fput_needed, err;
struct msghdr msg_sys;
struct socket *sock;
 
-   if (flags & MSG_CMSG_COMPAT)
-   return -EINVAL;
-
sock = sockfd_lookup_light(fd, , _needed);
if (!sock)
goto out;
 
-   err = __sys_sendmsg(sock, msg, _sys, flags, NULL);
+   err = ___sys_sendmsg(sock, msg, _sys, flags, NULL);
 
fput_light(sock->file,

linux-next: manual merge of the mvebu tree with the imx-mxs tree

2013-06-05 Thread Stephen Rothwell

Hi all,

Today's linux-next merge of the mvebu tree got a conflict in
arch/arm/Kconfig.debug between commit 284166ffebc3 ("ARM: imx: enable
low-level debug support for imx6sl") from the imx-mxs tree and commit
5be22d8c29fd ("arm: mvebu: add another earlyprintk Kconfig option") from
the mvebu tree.

I fixed it up (see below) and can carry the fix as necessary (no action
is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

diff --cc arch/arm/Kconfig.debug
index 29f7623,e6a6ab1..000
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@@ -639,9 -656,9 +664,10 @@@ config DEBUG_LL_INCLUD
 DEBUG_IMX35_UART || \
 DEBUG_IMX51_UART || \
 DEBUG_IMX53_UART ||\
 -   DEBUG_IMX6Q_UART
 +   DEBUG_IMX6Q_UART || \
 +   DEBUG_IMX6SL_UART
-   default "debug/mvebu.S" if DEBUG_MVEBU_UART
+   default "debug/mvebu.S" if DEBUG_MVEBU_UART || \
+  DEBUG_MVEBU_UART_ALTERNATE
default "debug/mxs.S" if DEBUG_IMX23_UART || DEBUG_IMX28_UART
default "debug/nomadik.S" if DEBUG_NOMADIK_UART
default "debug/omap2plus.S" if DEBUG_OMAP2PLUS_UART


pgpPoi4arBZan.pgp
Description: PGP signature

Re: [patch 2/2] memcg: do not sleep on OOM waitqueue with full charge context

2013-06-05 Thread Johannes Weiner

On Wed, Jun 05, 2013 at 09:10:51PM -0700, David Rientjes wrote:
> On Wed, 5 Jun 2013, Johannes Weiner wrote:
> 
> > The memcg OOM handling is incredibly fragile because once a memcg goes
> > OOM, one task (kernel or userspace) is responsible for resolving the
> > situation.
> 
> Not sure what this means.  Are you referring to the case where the memcg 
> is disabled from oom killing and we're relying on a userspace handler and 
> it may be caught on the waitqueue itself?  Otherwise, are you referring to 
> the task as eventually being the only one that takes the hierarchy oom 
> lock and calls mem_cgroup_out_of_memory()?  I don't think you can make a 
> case for the latter since the one that calls mem_cgroup_out_of_memory() 
> should return and call memcg_wakeup_oom().  We obviously don't want to do 
> any memory allocations in this path.

If the killing task or one of the sleeping tasks is holding a lock
that the selected victim needs in order to exit, no progress can be
made.

The report we had a few months ago was that a task held the i_mutex
when trying to charge a page cache page and then invoked the OOM
handler and looped on CHARGE_RETRY.  Meanwhile, the selected victim
was just entering truncate() and now stuck waiting for the i_mutex.

I'll add this scenario to the changelog, hopefully it will make the
rest a little clearer.

> > Every other task that gets caught trying to charge memory
> > gets stuck in a waitqueue while potentially holding various filesystem
> > and mm locks on which the OOM handling task may now deadlock.
> > 
> 
> What locks?  The oom killer quite extensively needs task_lock() but we 
> shouldn't be calling it in the case where we hold this since its a 
> spinlock and mem_cgroup_do_charge() never gets to the point of handling 
> the oom.
> 
> Again, are you referring only to a userspace handler here?

No.  The OOM path (does not matter if user task or kernel) is
currently entered from the charge context, which may hold filesystem
and mm locks (look who charges page cache pages e.g.) and they are not
released until the situation is resolved because the task either loops
inside __mem_cgroup_try_charge() on CHARGE_RETRY or is stuck in the
waitqueue.

And waiting for anybody else to make progress while holding mmap_sem,
i_mutex etc. is the problem.

> > Do two things:
> > 
> > 1. When OOMing in a system call (buffered IO and friends), invoke the
> >OOM killer but just return -ENOMEM, never sleep.  Userspace should
> >be able to handle this.
> > 
> 
> The sleep should never occur for any significant duration currently, the 
> current implementation ensures one process calls 
> mem_cgroup_out_of_memory() while all others sit on a waitqueue until that 
> process returns from the oom killer and then they all wakeup again so the 
> killed process may exit and all others can retry their allocations now 
> that something has been killed.  If that process hasn't exited yet, the 
> next process that locks the memcg oom hierarchy will see the killed 
> process with TIF_MEMDIE and the oom killer is deferred.  So this sleep is 
> very temporary already, I don't see why you're trying to make it faster 
> while another thread finds a candidate task, it works quite well already.

It's not the amount of time slept on average, it's that going to sleep
in this context may deadlock the killing or the killed task.  I don't
see where you read "making it faster", but I'm trying to make it just
slightly faster than a deadlock.

> > 2. When OOMing in a page fault and somebody else is handling the
> >situation, do not sleep directly in the charging code.  Instead,
> >remember the OOMing memcg in the task struct and then fully unwind
> >the page fault stack first before going to sleep.
> > 
> 
> Are you trying to address a situation here where the memcg oom handler 
> takes too long to work?  We've quite extensively tried to improve that, 
> especially by reducing its dependency on tasklist_lock which is contended 
> from the writeside in the exit path and by only iterating processes that 
> are attached to that memcg hierarchy and not all processes on the system 
> like before.  I don't see the issue with scheduling other oom processes 
> while one is doing mem_cgroup_out_of_memory().  (I would if the oom killer 
> required things like mm->mmap_sem, but obviously it doesn't for reasons 
> not related to memcg.)

I really don't see where you read "performance optimization" in this
changelog, the very first paragraph mentions "very fragile" and "may
deadlock".

> > While reworking the OOM routine, also remove a needless OOM waitqueue
> > wakeup when invoking the killer.  Only uncharges and limit increases,
> > things that actually change the memory situation, should do wakeups.
> > 
> 
> It's not needless at all, it's vitally required!  The oom killed process 
> needs to be removed from the waitqueue and scheduled now with TIF_MEMDIE 
> that the memcg oom killer provided so the allocation

[PATCH v3] mfd: DT bindings for the palmas family MFD

2013-06-05 Thread J Keerthy

Add the various binding files for the palmas family of chips. There is a
top-level MFD binding, then a separate binding for the regulator IP blocks on the chips.

Signed-off-by: Graeme Gregory 
Signed-off-by: J Keerthy 
Signed-off-by: Ian Lartey 
Reviewed-by: Stephen Warren  
---
 Documentation/devicetree/bindings/mfd/palmas.txt   |   49 +
 .../devicetree/bindings/regulator/palmas-pmic.txt  |   72 
 2 files changed, 121 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/mfd/palmas.txt
 create mode 100644 Documentation/devicetree/bindings/regulator/palmas-pmic.txt

diff --git a/Documentation/devicetree/bindings/mfd/palmas.txt 
b/Documentation/devicetree/bindings/mfd/palmas.txt
new file mode 100644
index 000..7bcd59c
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/palmas.txt
@@ -0,0 +1,49 @@
+* palmas device tree bindings
+
+The TI palmas family current members :-
+twl6035 (palmas)
+twl6037 (palmas)
+tps65913 (palmas)
+tps65914 (palmas)
+
+Required properties:
+- compatible : Should be from the list
+  ti,twl6035
+  ti,twl6036
+  ti,twl6037
+  ti,tps65913
+  ti,tps65914
+  ti,tps80036
+and also the generic series names
+  ti,palmas
+- interrupt-controller : palmas has its own internal IRQs
+- #interrupt-cells : should be set to 2 for IRQ number and flags
+  The first cell is the IRQ number.
+  The second cell is the flags, encoded as the trigger masks from
+  Documentation/devicetree/bindings/interrupts.txt
+- interrupt-parent : The parent interrupt controller.
+
+Optional properties:
+  ti,mux-padX : set the pad register X (1-2) to the correct muxing for the
+   hardware, if not set will use muxing in OTP.
+
+Example:
+
+palmas {
+   compatible = "ti,twl6035", "ti,palmas";
+   reg = <0x48>
+   interrupt-parent = <>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+
+   ti,mux-pad1 = <0>;
+   ti,mux-pad2 = <0>;
+
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   pmic {
+   compatible = "ti,twl6035-pmic", "ti,palmas-pmic";
+   
+   };
+}
diff --git a/Documentation/devicetree/bindings/regulator/palmas-pmic.txt 
b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
new file mode 100644
index 000..d5a3086
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
@@ -0,0 +1,72 @@
+* palmas regulator IP block devicetree bindings
+
+Required properties:
+- compatible : Should be from the list
+  ti,twl6035-pmic
+  ti,twl6036-pmic
+  ti,twl6037-pmic
+  ti,tps65913-pmic
+  ti,tps65914-pmic
+and also the generic series names
+  ti,palmas-pmic
+- interrupt-parent : The parent interrupt controller which is palmas.
+- interrupts : The interrupt number and the type which can be looked up here:
+  arch/arm/boot/dts/include/dt-bindings/interrupt-controller/irq.h
+- interrupts-name: The names of the individual interrupts.
+
+Optional properties:
+- ti,ldo6-vibrator : ldo6 is in vibrator mode
+
+Optional nodes:
+- regulators : Must contain a sub-node per regulator from the list below.
+  Each sub-node should contain the constraints and initialization
+  information for that regulator. See regulator.txt for a
+  description of standard properties for these sub-nodes.
+  Additional custom properties  are listed below.
+
+  For ti,palmas-pmic - smps12, smps123, smps3 depending on OTP,
+  smps45, smps457, smps7 depending on variant, smps6, smps[8-10],
+  ldo[1-9], ldoln, ldousb.
+
+  Optional sub-node properties:
+  ti,warm-reset - maintain voltage during warm reset(boolean)
+  ti,roof-floor - control voltage selection by pin(boolean)
+  ti,mode-sleep - mode to adopt in pmic sleep 0 - off, 1 - auto,
+  2 - eco, 3 - forced pwm
+  ti,tstep - slope control 0 - Jump, 1 10mV/us, 2 5mV/us, 3 2.5mV/us
+  ti,smps-range - OTP has the wrong range set for the hardware so override
+  0 - low range, 1 - high range.
+
+Example:
+
+#include <dt-bindings/interrupt-controller/irq.h>
+
+pmic {
+   compatible = "ti,twl6035-pmic", "ti,palmas-pmic";
+   interrupt-parent = <>;
+   interrupts = <14 IRQ_TYPE_NONE>;
+   interrupts-name = "short-irq";
+
+   ti,ldo6-vibrator;
+
+   regulators {
+   smps12_reg : smps12 {
+   regulator-name = "smps12";
+   regulator-min-microvolt = < 60>;
+   regulator-max-microvolt = <150>;
+   regulator-always-on;
+   regulator-boot-on;
+   ti,warm-reset;
+   ti,roof-floor;
+   ti,mode-sleep = <0>;
+   ti,tstep = <0>;
+   ti,smps-range = <1>;
+   };
+
+   ldo1_reg: ldo1 {
+   regulator-name =

[git pull] m68knommu follow up fixes for 3.10

2013-06-05 Thread Greg Ungerer

Hi Linus,

Can you please pull the m68knommu git tree, for-linus branch.

It contains only a single fix for compilation breakage to many of the
ColdFire CPU targets.

Regards
Greg



The following changes since commit e4aa937ec75df0eea0bee03bffa3303ad36c986b:

  Linux 3.10-rc3 (2013-05-26 16:00:47 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git for-linus

for you to fetch changes up to cf6c31fc5c3de225348742c95cc6185fca20a2f2:

  m68k: only use local gpio_request_one if not using GPIOLIB (2013-05-29 
16:56:45 +1000)


Greg Ungerer (1):
  m68k: only use local gpio_request_one if not using GPIOLIB

 arch/m68k/include/asm/gpio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[git pull] drm fixes (try again)

2013-06-05 Thread Dave Airlie


Hi,

this time after I appended the right file.

regression fixers for the big 3:
nouveau: hdmi audio, dac load detect, s/r regressions fixed
radeon: long standing system hang fixed, hdmi audio and rs780 fast fb
fixes
intel: one old regression, a WARN removal, and a stop X dying fix

otherwise one mgag200 fix, a couple of arm build fixes, and a core use
after free fix.

Dave.

The following changes since commit d683b96b072dc4680fc74964eca77e6a23d1fa6e:

  Linux 3.10-rc4 (2013-06-02 17:11:17 +0900)

are available in the git repository at:

  git://people.freedesktop.org/~airlied/linux drm-fixes

for you to fetch changes up to 0e32fde96bb9c1fa8fa477e52c1d6ae2f4995cea:

  Merge branch 'drm-nouveau-fixes-3.10' of 
git://anongit.freedesktop.org/git/nouveau/linux-2.6 into drm-fixes (2013-06-05 
14:35:08 +1000)



Adis Hamzić (1):
  radeon: Fix system hang issue when using KMS with older cards

Alex Deucher (1):
  drm/radeon: don't allow audio on DCE6

Alexander Stein (1):
  drm/nv84/disp: Fix HDMI audio regression

Arnd Bergmann (2):
  drm/tilcd: select BACKLIGHT_LCD_SUPPORT
  drm/nouveau: use mdelay instead of large udelay constants

Ben Mesman (1):
  drm/i915: no lvds quirk for hp t5740

Ben Skeggs (3):
  drm/nv50-nv84/fifo: fix resume regression introduced by playlist race fix
  drm/nv50/disp: force dac power state during load detect
  drm/nv50/kms: use dac loadval from vbios, where it's available

Christopher Harvey (1):
  drm/mgag200: Add missing write to index before accessing data register

Daniel Vetter (2):
  drm/i915: Fix spurious -EIO/SIGBUS on wedged gpus
  drm/i915: Quirk the pipe A quirk in the modeset state checker

Dave Airlie (3):
  Merge branch 'drm-fixes-3.10' of 
git://people.freedesktop.org/~agd5f/linux into drm-fixes
  Merge tag 'drm-intel-fixes-2013-06-04' of 
git://people.freedesktop.org/~danvet/drm-intel into drm-fixes
  Merge branch 'drm-nouveau-fixes-3.10' of 
git://anongit.freedesktop.org/git/nouveau/linux-2.6 into drm-fixes

Egbert Eich (1):
  drm/i915/sdvo: Use &intel_sdvo->ddc instead of intel_sdvo->i2c for DDC.

Huacai Chen (1):
  drm: fix a use-after-free when GPU acceleration disabled

Samuel Li (1):
  drm/radeon: Use direct mapping for fast fb access on RS780/RS880 (v2)

 drivers/gpu/drm/drm_irq.c  |  6 ++-
 drivers/gpu/drm/i915/i915_gem.c|  7 +--
 drivers/gpu/drm/i915/intel_display.c   |  5 ++
 drivers/gpu/drm/i915/intel_lvds.c  |  4 +-
 drivers/gpu/drm/i915/intel_sdvo.c  |  2 +-
 drivers/gpu/drm/mgag200/mgag200_mode.c |  9 ++--
 drivers/gpu/drm/nouveau/core/engine/disp/dacnv50.c |  7 ++-
 .../gpu/drm/nouveau/core/engine/disp/hdminv84.c|  4 ++
 drivers/gpu/drm/nouveau/core/engine/fifo/nv50.c| 14 --
 drivers/gpu/drm/nouveau/core/include/core/class.h  |  2 +-
 drivers/gpu/drm/nouveau/nv50_display.c |  4 +-
 drivers/gpu/drm/radeon/atombios_encoders.c | 11 +++--
 drivers/gpu/drm/radeon/evergreen.c | 10 ++--
 drivers/gpu/drm/radeon/ni.c| 10 ++--
 drivers/gpu/drm/radeon/r100.c  |  9 ++--
 drivers/gpu/drm/radeon/r300.c  |  9 ++--
 drivers/gpu/drm/radeon/r420.c  | 10 ++--
 drivers/gpu/drm/radeon/r520.c  |  9 ++--
 drivers/gpu/drm/radeon/r600.c  | 53 --
 drivers/gpu/drm/radeon/r600d.h |  8 
 drivers/gpu/drm/radeon/radeon_asic.c   |  4 ++
 drivers/gpu/drm/radeon/radeon_asic.h   |  2 +
 drivers/gpu/drm/radeon/rs400.c |  9 ++--
 drivers/gpu/drm/radeon/rs600.c |  9 ++--
 drivers/gpu/drm/radeon/rs690.c |  9 ++--
 drivers/gpu/drm/radeon/rv515.c |  9 ++--
 drivers/gpu/drm/radeon/rv770.c | 10 ++--
 drivers/gpu/drm/radeon/si.c| 10 ++--
 drivers/gpu/drm/tilcdc/Kconfig |  1 +
 29 files changed, 188 insertions(+), 68 deletions(-)

Re: [git pull] drm fixes

2013-06-05 Thread Dave Airlie

On Thu, Jun 6, 2013 at 3:22 PM, Linus Torvalds
 wrote:
> On Thu, Jun 6, 2013 at 2:14 PM, Dave Airlie  wrote:
>>
>>  7 files changed, 32 insertions(+), 42 deletions(-)
>
> That's not at all what I get (including shortlog). I got
>
>  29 files changed, 188 insertions(+), 68 deletions(-)
>
> from a lot of commits you don't list.

doh I slurped in the wrong file to my emailer.

Will resend.

Sorry,
Dave.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc/pci: Improve device hotplug initialization

2013-06-05 Thread Guenter Roeck

On Thu, Jun 06, 2013 at 11:00:04AM +1000, Benjamin Herrenschmidt wrote:
> On Sat, 2013-06-01 at 06:58 -0700, Guenter Roeck wrote:
> > the comment was actually directed towards Yuanquan.
> > 
> > No problem, take your time. I did my best to test it, but I agree that this 
> > is a
> > critical area of the code, and it would be desirable to get additional 
> > scrutiny
> > and test feedback.
> > 
> > The code has been running in our system (P2020 and P5040) for several 
> > months.
> > I was preparing a patch for upstream submission when I noticed commit 
> > 37f02195b.
> > After testing this commit, I noticed the problems with it and wrote this 
> > patch,
> > which aligns the code with our initial patch. I tested it as good as I 
> > could on
> > our systems as well as with a P5040 evaluation board and an Intel GE PCIe
> > card.
> 
> Ok, so I like this very much. So much that I was considering still sneaking it
> into 3.10, until I hit a snag...
> 
> [ Basically, the previous patch that moved the setup to 
> pcibios_enable_device()
> always made me nervous. It did regress at least one platform (mac stuff) due
> to missed IRQ fixup, which I worked around later on, and I'm still not 
> terribly
> happy about it. Your approach is much cleaner. ]
> 
> I suppose that when I wrote the original setup stuff there wasn't an add
> hook or I didn't see it...
> 
> In fact I would go further and completely remove pcibios_setup_bus_devices()
> which is now empty since it's only called by the powerpc code, it's not
> a generic hook.
> 
> However, here's the snag. Unless I missed something, we now setup the devices
> DMA before we call pcibios_fixup_bus(). And *that* is going to break some
> pseries.
> 
> We have an assumption in there that the bus fixup is done first, because in
> some cases, the DMA windows are established at the bus level, and the "dev"
> setup just picks up the bits.
> 
> Now looking at that code, it's not unfixable but it won't make 3.10. Maybe
> we need a new pre-scan hook for busses... we can use the pcibios_add_device()
> hook of the bridge itself for P2P but that won't do for the root bus and I
> don't like having two different path here...
> 
Hi Ben,

you are right, pcibios_fixup_bus() is called after pcibios_add_device(),
at least in the initial scan.

Can you point me to some of the breaking code ? I guess it must be in some of
the pci_dma_dev_setup callbacks, but those I looked at only check devicetree
data or simply set function pointers, both of which should not be affected by
the call order.

How about pcibios_fixup_device, to be called after pcibios_fixup_bus ?

Thanks,
Guenter
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 1/2] acpi: video: add function to support unregister backlight interface

2013-06-05 Thread Aaron Lu

On 05/31/2013 12:10 PM, Lee, Chun-Yi wrote:
> There have some situation we unregister whole acpi/video driver by downstream
> driver just want to remove backlight control interface of acpi/video. It causes
> we lost other functions of acpi/video, e.g. transfer acpi event to input 
> event.
> 
> So, this patch add a new function, find_video_unregister_backlight, it provide
> the interface let downstream driver can tell acpi/video to unregister 
> backlight
> interface of all acpi video devices. Then we can keep functions of acpi/video
> but only remove backlight support.

It doesn't seem to be the best way to solve this problem to me, as the
platform driver doesn't need to be dependent on ACPI video driver and
ACPI video driver shouldn't handle things like these.

The current backlight model has limitations to solve problems like this,
also bear in mind we have some thinkpad models that have similar
problems. I sent the email hoping we can have a discussion on this
topic a while ago:
http://marc.info/?l=linux-acpi&m=136507538826872&w=2
Unfortunately nobody seems interested. 

I'm thinking how we can deal with such problems altogether, introducing
something like backlight manager seems to be a necessary thing to do now.

Thanks,
Aaron

> 
> Reference: bko#35622
> https://bugzilla.kernel.org/show_bug.cgi?id=35622
> 
> v2: Also unregister cooling devices.
> 
> Tested-by: Andrzej Krentosz 
> Cc: Zhang Rui 
> Cc: Len Brown 
> Cc: Rafael J. Wysocki 
> Cc: Carlos Corbacho 
> Cc: Matthew Garrett 
> Cc: Dmitry Torokhov 
> Cc: Corentin Chary 
> Cc: Aaron Lu 
> Cc: Thomas Renninger 
> Signed-off-by: Lee, Chun-Yi 
> ---
>  drivers/acpi/video.c |   54 
> ++
>  include/acpi/video.h |2 +
>  2 files changed, 56 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
> index c3932d0..f21104d 100644
> --- a/drivers/acpi/video.c
> +++ b/drivers/acpi/video.c
> @@ -1861,6 +1861,60 @@ static int __init intel_opregion_present(void)
>   return opregion;
>  }
>  
> +static acpi_status
> +find_video_unregister_backlight(acpi_handle handle, u32 lvl, void *context,
> + void **rv)
> +{
> + struct acpi_device *acpi_dev;
> + struct acpi_video_bus *video;
> + struct acpi_video_device *dev, *next;
> +
> + if (acpi_bus_get_device(handle, &acpi_dev))
> + return AE_OK;
> +
> + if (!acpi_match_device_ids(acpi_dev, video_device_ids)) {
> + video = acpi_driver_data(acpi_dev);
> + acpi_video_bus_stop_devices(video);
> + mutex_lock(&video->device_list_lock);
> + list_for_each_entry_safe(dev, next, &video->video_device_list,
> + entry) {
> + if (dev->backlight) {
> + backlight_device_unregister(dev->backlight);
> + dev->backlight = NULL;
> + kfree(dev->brightness->levels);
> + kfree(dev->brightness);
> + }
> + if (dev->cooling_dev) {
> + sysfs_remove_link(&dev->dev->dev.kobj,
> +   "thermal_cooling");
> + sysfs_remove_link(&dev->cooling_dev->device.kobj,
> +   "device");
> + thermal_cooling_device_unregister(dev->cooling_dev);
> + dev->cooling_dev = NULL;
> + }
> + }
> + mutex_unlock(&video->device_list_lock);
> + acpi_video_bus_start_devices(video);
> + }
> + return AE_OK;
> +}
> +
> +void acpi_video_backlight_unregister(void)
> +{
> + if (!register_count) {
> + /*
> +  * If the acpi video bus is already unloaded, don't
> +  * unregister backlight of devices and return directly.
> +  */
> + return;
> + }
> + acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
> + ACPI_UINT32_MAX, find_video_unregister_backlight,
> + NULL, NULL, NULL);
> + return;
> +}
> +EXPORT_SYMBOL(acpi_video_backlight_unregister);
> +
>  int acpi_video_register(void)
>  {
>   int result = 0;
> diff --git a/include/acpi/video.h b/include/acpi/video.h
> index 61109f2..1e810a1 100644
> --- a/include/acpi/video.h
> +++ b/include/acpi/video.h
> @@ -19,11 +19,13 @@ struct acpi_device;
>  #if (defined CONFIG_ACPI_VIDEO || defined CONFIG_ACPI_VIDEO_MODULE)
>  extern int acpi_video_register(void);
>  extern void acpi_video_unregister(void);
> +extern void acpi_video_backlight_unregister(void);
>  extern int acpi_video_get_edid(struct acpi_device *device, int type,
>  int device_id, void **edid);
>  #else
>  static inline int acpi_video_register(void) { return 0; }
>  static inline void

Re: [git pull] drm fixes

2013-06-05 Thread Linus Torvalds

On Thu, Jun 6, 2013 at 2:14 PM, Dave Airlie  wrote:
>
>  7 files changed, 32 insertions(+), 42 deletions(-)

That's not at all what I get (including shortlog). I got

 29 files changed, 188 insertions(+), 68 deletions(-)

from a lot of commits you don't list.

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[git pull] drm fixes

2013-06-05 Thread Dave Airlie


Hi Linus,

regression fixers for the big 3:
nouveau: hdmi audio, dac load detect, s/r regressions fixed
radeon: long standing system hang fixed, hdmi audio and rs780 fast fb 
fixes
intel: one old regression, a WARN removal, and a stop X dying fix

otherwise one mgag200 fix, a couple of arm build fixes, and a core use 
after free fix.

Dave.

The following changes since commit c89b65e7fffef745bdd36c372aa0dea778fecbab:

  qxl: fix Kconfig deps - select FB_DEFERRED_IO (2013-05-28 17:03:37 +1000)

are available in the git repository at:

  git://people.freedesktop.org/~airlied/linux drm-fixes

for you to fetch changes up to 970fa986fadb1165cf38b45b70e98302a3bee497:

  drm/qxl: fix build warnings on 32-bit (2013-05-31 12:45:09 +1000)


Alex Deucher (4):
  drm/radeon: fix typo in cu_per_sh on verde
  drm/radeon: fix card_posted check for newer asics
  drm/radeon: don't check crtcs in card_posted() on cards without DCE
  drm/radeon: narrow scope of Apple re-POST hack

Christian König (1):
  drm/radeon: UVD block on SUMO2 is the same as on SUMO

Dave Airlie (2):
  Merge branch 'drm-fixes-3.10' of 
git://people.freedesktop.org/~agd5f/linux into drm-next
  drm/qxl: fix build warnings on 32-bit

Kleber Sacilotto de Souza (1):
  radeon: use max_bus_speed to activate gen2 speeds

 drivers/gpu/drm/qxl/qxl_ioctl.c|  4 ++--
 drivers/gpu/drm/qxl/qxl_kms.c  |  9 +
 drivers/gpu/drm/radeon/evergreen.c | 10 +++---
 drivers/gpu/drm/radeon/r600.c  |  9 ++---
 drivers/gpu/drm/radeon/radeon_device.c | 27 ---
 drivers/gpu/drm/radeon/rv770.c | 13 +++--
 drivers/gpu/drm/radeon/si.c|  2 +-
 7 files changed, 32 insertions(+), 42 deletions(-)

Re: [PATCH] Modify UEFI anti-bricking code

2013-06-05 Thread joeyli

於 三，2013-06-05 於 16:59 +0100，Matt Fleming 提到：
> On Wed, 05 Jun, at 02:53:27PM, Matthew Garrett wrote:
> > On Wed, 2013-06-05 at 15:49 +0100, Fleming, Matt wrote:
> > 
> > > Folks, what do you want me to do with this? Merge it with Matthew's patch?
> > 
> > Do that and add Joey's signed-off-by?
> 
> Right, this is what I've got queued up.
> 
> ---
> 
> >From 380dcc12ba82f4e10feb6a72207b2e4771d16d8d Mon Sep 17 00:00:00 2001
> From: Matthew Garrett 
> Date: Sat, 1 Jun 2013 16:06:20 -0400
> Subject: [PATCH] Modify UEFI anti-bricking code
> 
> This patch reworks the UEFI anti-bricking code, including an effective
> reversion of cc5a080c and 31ff2f20. It turns out that calling
> QueryVariableInfo() from boot services results in some firmware
> implementations jumping to physical addresses even after entering virtual
> mode, so until we have 1:1 mappings for UEFI runtime space this isn't
> going to work so well.
[...]

The following diff is based on the 380dcc12 patch queued in the efi git tree;
it includes Matthew's and hpa's suggestions. I fixed the attributes of the DUMMY
object to NV/BS/RT and introduced a #define for the minimum reserved flash
space.

This change works for me on OVMF.



Thanks a lot!
Joey Lee

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cc3cfe8..2617675 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -53,6 +53,8 @@
 
 #define EFI_DEBUG  1
 
+#define EFI_MIN_RESERVE 5120
+
 #define EFI_DUMMY_GUID \
EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 
0x9f, 0x92, 0xa9)
 
@@ -988,7 +990,11 @@ void __init efi_enter_virtual_mode(void)
kfree(new_memmap);
 
/* clean DUMMY object */
-   efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, 0, 0, NULL);
+   efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+EFI_VARIABLE_NON_VOLATILE |
+EFI_VARIABLE_BOOTSERVICE_ACCESS |
+EFI_VARIABLE_RUNTIME_ACCESS,
+0, NULL);
 }
 
 /*
@@ -1051,7 +1057,12 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
 * write if permitting it would reduce the available space to under
 * 5KB. This figure was provided by Samsung, so should be safe.
 */
-   if ((remaining_size - size < 5120) && !efi_no_storage_paranoia) {
+   if ((remaining_size - size < EFI_MIN_RESERVE) &&
+   !efi_no_storage_paranoia) {
+
+   if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+   return EFI_OUT_OF_RESOURCES;
+
/*
 * Triggering garbage collection may require that the firmware
 * generate a real EFI_OUT_OF_RESOURCES error. We can force
@@ -1061,7 +1072,10 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
void *dummy = kmalloc(dummy_size, GFP_ATOMIC);
 
status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- attributes, dummy_size, dummy);
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ dummy_size, dummy);
 
if (status == EFI_SUCCESS) {
/*
@@ -1085,7 +1099,7 @@ efi_status_t efi_query_variable_store(u32 attributes, 
unsigned long size)
/*
 * There still isn't enough room, so return an error
 */
-   if (remaining_size - size < 5120)
+   if (remaining_size - size < EFI_MIN_RESERVE)
return EFI_OUT_OF_RESOURCES;
}
 







--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

2013-06-05 Thread David Miller

From: Eric Dumazet 
Date: Wed, 05 Jun 2013 21:35:25 -0700

> On Thu, 2013-06-06 at 12:56 +1000, Michael Neuling wrote:
>> On Thu, May 23, 2013 at 7:07 AM, Andy Lutomirski  wrote:
>> > MSG_CMSG_COMPAT is (AFAIK) not intended to be part of the API --
>> > it's a hack that steals a bit to indicate to other networking code
>> > that a compat entry was used.  So don't allow it from a non-compat
>> > syscall.
>> 
>> Dave & Linus
>> 
>> This is causing a regression on 64bit powerpc with 32bit usermode.
>> When I hit userspace, udev is broken and I suspect all networking is
>> broken as well.
>> 
>> Can we please revert 1be374a0518a288147c6a7398792583200a67261 upstream?
>> 
> 
> It seems to also break x86_64, if using 32bit usermode.

Sorry, I only merged this because Ingo Molnar and others kept beating
me over the head about merging this fix.

Linus please revert, and I will not bow to such pressure in the future,
I should know better.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 1/2] arch: invoke oom-killer from page fault

2013-06-05 Thread David Rientjes

On Thu, 6 Jun 2013, Johannes Weiner wrote:

> From: Johannes Weiner 
> Subject: [patch] mm: invoke oom-killer from remaining unconverted page fault
>  handlers
> 
> A few remaining architectures directly kill the page faulting task in
> an out of memory situation.  This is usually not a good idea since
> that task might not even use a significant amount of memory and so may
> not be the optimal victim to resolve the situation.
> 
> Since '1c0fe6e mm: invoke oom-killer from page fault' (2.6.29) there
> is a hook that architecture page fault handlers are supposed to call
> to invoke the OOM killer and let it pick the right task to kill.
> Convert the remaining architectures over to this hook.
> 
> To have the previous behavior of simply taking out the faulting task
> the vm.oom_kill_allocating_task sysctl can be set to 1.
> 
> Signed-off-by: Johannes Weiner 

Acked-by: David Rientjes 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] [RFC]Watchdog:core: constant pinging until userspace timesout when delay very less

2013-06-05 Thread Guenter Roeck

On Thu, Jun 06, 2013 at 08:30:01AM +0530, anish singh wrote:
> Hello Wim Van,
> Can you look into below?
> 
Please be patient. Wim tends to be busy.

Guenter

> On Wed, Jun 5, 2013 at 8:39 AM, anish singh  
> wrote:
> > Hello Wim Van Sabroeck,
> > Can I get your inputs on this?
> >
> > On Tue, Jun 4, 2013 at 8:39 AM, anish singh  
> > wrote:
> >> On Tue, Jun 4, 2013 at 3:55 AM, Guenter Roeck  wrote:
> >>> On Mon, Jun 03, 2013 at 10:23:04PM +0530, anish singh wrote:
>  On Mon, Jun 3, 2013 at 8:57 PM, Guenter Roeck  wrote:
>  > On Sun, Jun 02, 2013 at 03:43:07PM +0530, anish kumar wrote:
>  >> Certain watchdog drivers use a timer to keep kicking the watchdog at
>  >> a rate of 0.5s (HZ/2) until userspace times out. They do this as
>  >> we can't guarantee that watchdog will be pinged fast enough
>  >> for all system loads, especially if timeout is configured for
>  >> less than or equal to 1 second(basically small values).
>  >>
>  >> As suggested by Wim Van Sebroeck & Guenter Roeck we should
>  >> add this functionality of individual watchdog drivers in the core
>  >> watchdog core.
>  >>
>  >> Signed-off-by: anish kumar 
>  >
>  > Not exactly what I had in mind. My idea was to enable the softdog only 
>  > if
>  > the hardware watchdog's maximum timeout was low (say, less than a 
>  > couple
>  > of minutes), and if a timeout larger than its maximum value was 
>  > configured.
> 
>  watchdog_timeout_invalid wouldn't this check will fail if the user space 
>  tries
>  to set maximum timeout more that what driver can support?It would work
>  for pika_wdt.c as it is old watchdog driver and doesn't register with 
>  watchdog
>  framework but new drivers have to pass this api.
> 
>  OR
> 
>  Do you want to remove this check and go as explained by you?I would
>  favour this approach though.
> 
> >>> One would still have a check, but the enforced limits would no longer be
> >>> the driver limits, but larger limits implemented in the watchdog core.
> >> How much larger would be the big question here?Should it be configurable
> >> property(sysfs?) or some hardcoding based on existing drivers?
> >>
> >> Before going for next patch, it would be better for me to wait for some
> >> more comments.
> >>>
>  > In that case, I would have set the hardware watchdog to its maximum 
>  > value
>  > and use the softdog to ping it at a rate of, say, 50% of this maximum.
>  >
>  > If userspace would not ping the watchdog within its configured value,
>  > I would stop pinging the hardware watchdog and let it time out.
> 
>  One more question.Why is the return value of watchdog_ping int? Anyway
>  we discard it.
> >>>
> >>> I can not answer that question.
> >>>
> >>> Guenter
> --
> To unsubscribe from this list: send the line "unsubscribe linux-watchdog" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] memcg: clean up memcg->nodeinfo

2013-06-05 Thread David Rientjes

On Wed, 5 Jun 2013, Johannes Weiner wrote:

> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index ff7b40d..d169a8d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
>   struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
>  };
>  
> -struct mem_cgroup_lru_info {
> - struct mem_cgroup_per_node *nodeinfo[0];
> -};
> -
>  /*
>   * Cgroups above their limits are maintained in a RB-Tree, independent of
>   * their hierarchy representation
> @@ -384,14 +380,9 @@ struct mem_cgroup {
>  #endif
>   /* when kmem shrinkers can sleep but can't proceed due to context */
>   struct work_struct kmemcg_shrink_work;
> - /*
> -  * Per cgroup active and inactive list, similar to the
> -  * per zone LRU lists.
> -  *
> -  * WARNING: This has to be the last element of the struct. Don't
> -  * add new fields after this point.
> -  */
> - struct mem_cgroup_lru_info info;
> +
> + struct mem_cgroup_per_node *nodeinfo[0];
> + /* WARNING: nodeinfo has to be the last member here */

Nice cleanup, but would this be better as a flexible array member?  It 
would have an incomplete type like it should have instead of sizeof 
returning 0.

>  };
>  
>  static size_t memcg_size(void)
> @@ -777,7 +768,7 @@ static struct mem_cgroup_per_zone *
>  mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
>  {
>   VM_BUG_ON((unsigned)nid >= nr_node_ids);
> - return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
> + return &memcg->nodeinfo[nid]->zoneinfo[zid];
>  }
>  
>  struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
> @@ -6595,13 +6586,13 @@ static int alloc_mem_cgroup_per_zone_info(struct 
> mem_cgroup *memcg, int node)
>   mz->on_tree = false;
>   mz->memcg = memcg;
>   }
> - memcg->info.nodeinfo[node] = pn;
> + memcg->nodeinfo[node] = pn;
>   return 0;
>  }
>  
>  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
>  {
> - kfree(memcg->info.nodeinfo[node]);
> + kfree(memcg->nodeinfo[node]);
>  }
>  
>  static struct mem_cgroup *mem_cgroup_alloc(void)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: mmotm 2013-06-05-17-24 uploaded (ptp_pch)

2013-06-05 Thread Randy Dunlap

On 06/05/13 17:26, a...@linux-foundation.org wrote:
> The mm-of-the-moment snapshot 2013-06-05-17-24 has been uploaded to
> 
>http://www.ozlabs.org/~akpm/mmotm/
> 
> mmotm-readme.txt says
> 
> README for mm-of-the-moment:
> 
> http://www.ozlabs.org/~akpm/mmotm/
> 

on i386:  when CONFIG_PCI is not enabled:

drivers/ptp/ptp_pch.c:710:1: warning: data definition has no type or storage 
class [enabled by default]
drivers/ptp/ptp_pch.c:710:1: warning: type defaults to 'int' in declaration of 
'module_pci_driver' [-Wimplicit-int]
drivers/ptp/ptp_pch.c:710:1: warning: parameter names (without types) in 
function declaration [enabled by default]
drivers/ptp/ptp_pch.c:701:26: warning: 'pch_driver' defined but not used 
[-Wunused-variable]


-- 
~Randy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 1/4] ARM: davinci: dm365 evm: remove init_enable from ths7303 pdata

2013-06-05 Thread Sekhar Nori

On 5/25/2013 11:09 PM, Prabhakar Lad wrote:
> From: Lad, Prabhakar 
> 
> remove init_enable from ths7303 pdata as it is being dropped
> from ths7303_platform_data.
> 
> Signed-off-by: Lad, Prabhakar 
> Cc: Sekhar Nori 
> Cc: Hans Verkuil 
> Cc: Laurent Pinchart 
> Cc: Mauro Carvalho Chehab 
> Cc: linux-kernel@vger.kernel.org
> Cc: davinci-linux-open-sou...@linux.davincidsp.com

Acked-by: Sekhar Nori 

I would prefer this be squashed into 2/4 but I leave it to you.

Thanks,
Sekhar
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 1/2] arch: invoke oom-killer from page fault

2013-06-05 Thread Johannes Weiner

On Wed, Jun 05, 2013 at 08:57:44PM -0700, David Rientjes wrote:
> On Wed, 5 Jun 2013, Johannes Weiner wrote:
> 
> > Since '1c0fe6e mm: invoke oom-killer from page fault', page fault
> > handlers should not directly kill faulting tasks in an out of memory
> > condition.
> 
> I have no objection to the patch, but there's no explanation given here 
> why exiting with a kill shouldn't be done.  Is it because of memory 
> reserves and there is no guarantee that current will be able to exit?  Or 
> is it just for consistency with other archs?
> 
> > Instead, they should be invoking the OOM killer to pick
> > the right task.  Convert the remaining architectures.
> > 
> 
> If this is a matter of memory reserves, I guess you could point people who 
> want the current behavior (avoiding the expensiveness of the tasklist scan 
> in the oom killer for example) to /proc/sys/vm/oom_kill_allocating_task?
> 
> This changelog is a bit cryptic in its motivation.

Fixing copy-pasted bitrot^W^W^W^WHow about this?

---
From: Johannes Weiner 
Subject: [patch] mm: invoke oom-killer from remaining unconverted page fault
 handlers

A few remaining architectures directly kill the page faulting task in
an out of memory situation.  This is usually not a good idea since
that task might not even use a significant amount of memory and so may
not be the optimal victim to resolve the situation.

Since '1c0fe6e mm: invoke oom-killer from page fault' (2.6.29) there
is a hook that architecture page fault handlers are supposed to call
to invoke the OOM killer and let it pick the right task to kill.
Convert the remaining architectures over to this hook.

To have the previous behavior of simply taking out the faulting task
the vm.oom_kill_allocating_task sysctl can be set to 1.

Signed-off-by: Johannes Weiner 
---
 arch/arc/mm/fault.c  | 6 --
 arch/metag/mm/fault.c| 6 --
 arch/mn10300/mm/fault.c  | 7 ---
 arch/openrisc/mm/fault.c | 8 
 arch/score/mm/fault.c| 8 
 arch/tile/mm/fault.c | 8 
 6 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index c0decc1..d5ec60a 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -207,8 +207,10 @@ out_of_memory:
}
up_read(&mm->mmap_sem);
 
-   if (user_mode(regs))
-   do_group_exit(SIGKILL); /* This will never return */
+   if (user_mode(regs)) {
+   pagefault_out_of_memory();
+   return;
+   }
 
goto no_context;
 
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2c75bf7..8fddf46 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -224,8 +224,10 @@ do_sigbus:
 */
 out_of_memory:
up_read(&mm->mmap_sem);
-   if (user_mode(regs))
-   do_group_exit(SIGKILL);
+   if (user_mode(regs)) {
+   pagefault_out_of_memory();
+   return 1;
+   }
 
 no_context:
/* Are we prepared to handle this kernel fault?  */
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index d48a84f..8a2e6de 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -345,9 +345,10 @@ no_context:
  */
 out_of_memory:
up_read(&mm->mmap_sem);
-   printk(KERN_ALERT "VM: killing process %s\n", tsk->comm);
-   if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
-   do_exit(SIGKILL);
+   if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) {
+   pagefault_out_of_memory();
+   return;
+   }
goto no_context;
 
 do_sigbus:
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index e2bfafc..4a41f84 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -267,10 +267,10 @@ out_of_memory:
__asm__ __volatile__("l.nop 1");
 
up_read(&mm->mmap_sem);
-   printk("VM: killing process %s\n", tsk->comm);
-   if (user_mode(regs))
-   do_exit(SIGKILL);
-   goto no_context;
+   if (!user_mode(regs))
+   goto no_context;
+   pagefault_out_of_memory();
+   return;
 
 do_sigbus:
up_read(&mm->mmap_sem);
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 47b600e..6b18fb0 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -172,10 +172,10 @@ out_of_memory:
down_read(&mm->mmap_sem);
goto survive;
}
-   printk("VM: killing process %s\n", tsk->comm);
-   if (user_mode(regs))
-   do_group_exit(SIGKILL);
-   goto no_context;
+   if (!user_mode(regs))
+   goto no_context;
+   pagefault_out_of_memory();
+   return;
 
 do_sigbus:
up_read(&mm->mmap_sem);
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 3d2b81c..f7f99f9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -573,10 +573,10 @@ out_of_memory:
down_read(&mm->mmap_sem);
goto survive;

Re: [PATCH 5/5] net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

2013-06-05 Thread Eric Dumazet

On Thu, 2013-06-06 at 12:56 +1000, Michael Neuling wrote:
> On Thu, May 23, 2013 at 7:07 AM, Andy Lutomirski  wrote:
> > MSG_CMSG_COMPAT is (AFAIK) not intended to be part of the API --
> > it's a hack that steals a bit to indicate to other networking code
> > that a compat entry was used.  So don't allow it from a non-compat
> > syscall.
> 
> Dave & Linus
> 
> This is causing a regression on 64bit powerpc with 32bit usermode.
> When I hit userspace, udev is broken and I suspect all networking is
> broken as well.
> 
> Can we please revert 1be374a0518a288147c6a7398792583200a67261 upstream?
> 

It seems to also break x86_64, if using 32bit usermode.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: mmotm 2013-06-05-17-24 uploaded (rtc)

2013-06-05 Thread Randy Dunlap

On 06/05/13 17:26, a...@linux-foundation.org wrote:
> The mm-of-the-moment snapshot 2013-06-05-17-24 has been uploaded to
> 
>http://www.ozlabs.org/~akpm/mmotm/
> 
> mmotm-readme.txt says
> 
> README for mm-of-the-moment:
> 
> http://www.ozlabs.org/~akpm/mmotm/
> 

on i386 when CONFIG_PM is not enabled:

drivers/rtc/class.c:339:18: error: lvalue required as unary '&' operand

due to:
#define rtc_class_dev_pm_ops    NULL



-- 
~Randy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Fix lockup related to stop_machine being stuck in __do_softirq.

2013-06-05 Thread Eric Dumazet

On Wed, 2013-06-05 at 21:25 -0700, gree...@candelatech.com wrote:
> From: Ben Greear 

> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 14d7758..f150ad6 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -204,6 +204,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
>   * should not be able to lock up the box.

Could you update the comment ?

I had the following :

- * We restart softirq processing for at most 2 ms,
- * and if need_resched() is not set.
+ * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
+ * but break the loop if need_resched() is set or after 2 ms.


>   */
>  #define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
> +#define MAX_SOFTIRQ_RESTART 10
>  
>  asmlinkage void __do_softirq(void)
>  {
> @@ -212,6 +213,7 @@ asmlinkage void __do_softirq(void)
>   unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
>   int cpu;
>   unsigned long old_flags = current->flags;
> + int max_restart = MAX_SOFTIRQ_RESTART;
>  
>   /*
>* Mask out PF_MEMALLOC s current task context is borrowed for the
> @@ -265,7 +267,8 @@ restart:
>  
>   pending = local_softirq_pending();
>   if (pending) {
> - if (time_before(jiffies, end) && !need_resched())
> + if (time_before(jiffies, end) && !need_resched()
> + && --max_restart)
>   goto restart;
>  
>   wakeup_softirqd();


if (cond1 && cond2 &&
cond3) ...

or

if (cond1 &&
cond2 &&
cond3) ...

Thanks !


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] Fix lockup related to stop_machine being stuck in __do_softirq.

2013-06-05 Thread greearb

From: Ben Greear 

The stop machine logic can lock up if all but one of
the migration threads make it through the disable-irq
step and the one remaining thread gets stuck in
__do_softirq.  The reason __do_softirq can hang is
that it has a bail-out based on jiffies timeout, but
in the lockup case, jiffies itself is not incremented.

To work around this, re-add the max_restart counter in __do_softirq
and stop processing softirqs after 10 restarts.

Thanks to Tejun Heo and Rusty Russell and others for
helping me track this down.

This was introduced in 3.9 by commit:  c10d73671ad30f5469
(softirq:  reduce latencies).

It may be worth looking into ath9k to see if it has issues with
its irq handler at a later date.

The hang stack traces look something like this:

[ cut here ]
WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7()
Hardware name: To be filled by O.E.M.
Watchdog detected hard LOCKUP on cpu 2
Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 
auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen 
lockd sunrpc]
Pid: 23, comm: migration/2 Tainted: G C   3.9.4+ #11
Call Trace:
   [] warn_slowpath_common+0x85/0x9f
 [] warn_slowpath_fmt+0x46/0x48
 [] watchdog_overflow_callback+0x9c/0xa7
 [] __perf_event_overflow+0x137/0x1cb
 [] ? x86_perf_event_set_period+0x103/0x10f
 [] perf_event_overflow+0x14/0x16
 [] intel_pmu_handle_irq+0x2dc/0x359
 [] perf_event_nmi_handler+0x19/0x1b
 [] nmi_handle+0x7f/0xc2
 [] ? oops_begin+0xa9/0xa9
 [] do_nmi+0xbc/0x304
 [] end_repeat_nmi+0x1e/0x2e
 [] ? vprintk_emit+0x40a/0x444
 [] ? stop_machine_cpu_stop+0xd8/0x274
 [] ? stop_machine_cpu_stop+0xd8/0x274
 [] ? stop_machine_cpu_stop+0xd8/0x274
 <>  [] ? copy_module_from_fd+0xe7/0xe7
 [] ? copy_module_from_fd+0xe7/0xe7
 [] ? copy_module_from_fd+0xe7/0xe7
 [] ? stop_one_cpu_nowait+0x30/0x30
 [] cpu_stopper_thread+0xae/0x162
 [] ? __schedule+0x5ef/0x637
 [] ? _raw_spin_unlock_irqrestore+0x47/0x7e
 [] ? trace_hardirqs_on_caller+0x123/0x15a
 [] ? trace_hardirqs_on+0xd/0xf
 [] ? _raw_spin_unlock_irqrestore+0x70/0x7e
 [] smpboot_thread_fn+0x258/0x260
 [] ? test_ti_thread_flag.clone.0+0x11/0x11
 [] kthread+0xc7/0xcf
 [] ? __init_kthread_worker+0x5b/0x5b
 [] ret_from_fork+0x7c/0xb0
 [] ? __init_kthread_worker+0x5b/0x5b
---[ end trace 4947dfa9b0a4cec3 ]---
BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17]
Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 
auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen 
lockd sunrpc]
irq event stamp: 835637905
hardirqs last  enabled at (835637904): [] 
__do_softirq+0x9f/0x257
hardirqs last disabled at (835637905): [] 
apic_timer_interrupt+0x6d/0x80
softirqs last  enabled at (5654720): [] 
__do_softirq+0x1ff/0x257
softirqs last disabled at (5654725): [] irq_exit+0x5f/0xbb
CPU 1
Pid: 17, comm: migration/1 Tainted: GWC   3.9.4+ #11 To be filled by 
O.E.M. To be filled by O.E.M./To be filled by O.E.M.
RIP: 0010:[]  [] tasklet_hi_action+0xf0/0xf0
RSP: 0018:88022bc83ef0  EFLAGS: 0212
RAX: 0006 RBX: 880217deb710 RCX: 0006
RDX: 0006 RSI:  RDI: 81a050b0
RBP: 88022bc83f78 R08: 81a050b0 R09: 88022bc83cc8
R10: 05f2 R11: 8802203aaf50 R12: 88022bc83e68
R13: 815f48b2 R14: 88022bc83f78 R15: 88022230e000
FS:  () GS:88022bc8() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 00430070 CR3: 0001cbc5d000 CR4: 07e0
DR0:  DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0400
Process migration/1 (pid: 17, threadinfo 88022230e000, task 
8802223142c0)
Stack:
 8109f539 88022bc83f08 88022230e010 042080402bc83f88
 00010021bfcd 00012bc83fa8 88022230e000 88022230ffd8
 0030 88020006 0248d8cdab1c 1304da35fe841722
Call Trace:
 
 [] ? __do_softirq+0x117/0x257
 [] irq_exit+0x5f/0xbb
 [] smp_apic_timer_interrupt+0x8a/0x98
 [] apic_timer_interrupt+0x72/0x80
 
 [] ? vprintk_emit+0x417/0x444
 [] printk+0x4d/0x4f
 [] ? cpu_stopper_thread+0x57/0x162
 [] stop_machine_cpu_stop+0x22c/0x274
 [] ? copy_module_from_fd+0xe7/0xe7
 [] ? copy_module_from_fd+0xe7/0xe7
 [] ? copy_module_from_fd+0xe7/0xe7
 [] ? stop_one_cpu_nowait+0x30/0x30
 [] cpu_stopper_thread+0xae/0x162
 [] ? __schedule+0x5ef/0x637
 [] ? _raw_spin_unlock_irqrestore+0x47/0x7e
 [] ? trace_hardirqs_on_caller+0x123/0x15a
 [] ? trace_hardirqs_on+0xd/0xf
 [] ? _raw_spin_unlock_irqrestore+0x70/0x7e
 [] smpboot_thread_fn+0x258/0x260
 [] ? test_ti_thread_flag.clone.0+0x11/0x11
 [] kthread+0xc7/0xcf
 [] ? __init_kthread_worker+0x5b/0x5b
 [] ret_from_fork+0x7c/0xb0
 [] ? __init_kthread_worker+0x5b/0x5b
Code: 1c 25 18 e2 00 00 e8 cd fe ff ff e8 ac a4 04 00 fb 66 66 90 66 66 90 4c 
89 e3 48 85 db 0f 85

Re: [PATCH] ARM: avoid mis-detecting some V7 cores in the decompressor

2013-06-05 Thread Nicolas Pitre

On Wed, 5 Jun 2013, Stephen Boyd wrote:

> On 06/04, Nicolas Pitre wrote:
> > 
> > The LC0 area should be considered read-only as it may be located in 
> > flash.
> > 
> > Here's what I came with instead:
> > 
> > From: Nicolas Pitre 
> > Date: Tue, 4 Jun 2013 17:01:30 -0400
> > Subject: [PATCH] ARM: zImage: don't overwrite ourself with a page table
> > 
> > When zImage is loaded into RAM at a low address but TEXT_OFFSET
> > is set higher, we risk overwriting ourself with the page table
> > needed to turn on the cache as it is located relative to the relocation
> > address.  Let's defer the cache setup after relocation in that case.
> > 
> > Signed-off-by: Nicolas Pitre 
> 
> Reported-by: Stephen Boyd 
> Tested-by: Stephen Boyd 
> 
> This one passes testing on my two platforms with and without the
> 2Mb reservation at the beginning of ram. Seems like a good enough
> compromise for me.

Good!  Queued here:

http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=7751/1


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/5] Broken DM816x support in Linux 3.10-rc4

2013-06-05 Thread Paul Walmsley

Hi,

also,

On Wed, 5 Jun 2013, Aida Mynzhasova wrote:

> Aida Mynzhasova (5):
>   ARM: OMAP: AM33xx: multiple renames for early initialization

If this patch is what's responsible for all the file renaming, please drop 
it.  Looks from the change summary that it's just useless churn (although 
I haven't seen the patch here since it never made it to the lists)

- Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Fwd: [PATCH] usb: phy: samsung: Add support for EXYNOS4210

2013-06-05 Thread Praveen Paneri

Hi,

On Tue, May 28, 2013 at 2:34 PM, Jingoo Han  wrote:
> Add support for EXYNOS4210 that includes USB EHCI/OHCI.
> Previous PHY initialization code is not correct; thus, it is modified
 ^ You might
want to say "previous PHY init code does not support HOST and HSIC
PHY."

> to support EXYNOS4210 PHY. Also, after common clock framework for
> Samsung is added, clock name is defined as 'usb_device'.
>
> Signed-off-by: Jingoo Han 
> ---
> Tested on Exynos4210.
>
>  drivers/usb/phy/phy-samsung-usb.h  |   35 ++---
>  drivers/usb/phy/phy-samsung-usb2.c |   74 
> +---
>  2 files changed, 98 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/usb/phy/phy-samsung-usb.h 
> b/drivers/usb/phy/phy-samsung-usb.h
> index 70a9cae..34c35e8 100644
> --- a/drivers/usb/phy/phy-samsung-usb.h
> +++ b/drivers/usb/phy/phy-samsung-usb.h
> @@ -22,13 +22,22 @@
>
>  #define SAMSUNG_PHYPWR (0x00)
>
> +#define PHYPWR_PHY1_HSIC_NORMAL_MASK   (0xf << 9)
> +#define PHYPWR_PHY1_HSIC1_SLEEP(1 << 12)
> +#define PHYPWR_PHY1_HSIC1_FORCE_SUSPEND(1 << 11)
> +#define PHYPWR_PHY1_HSIC0_SLEEP(1 << 10)
> +#define PHYPWR_PHY1_HSIC0_FORCE_SUSPEND(1 << 9)
> +#define PHYPWR_PHY1_STD_NORMAL_MASK(0x7 << 6)
> +#define PHYPWR_PHY1_STD_SLEEP  (1 << 8)
> +#define PHYPWR_PHY1_STD_ANALOG_POWERDOWN   (1 << 7)
> +#define PHYPWR_PHY1_STD_FORCE_SUSPEND  (1 << 6)
>  #define PHYPWR_NORMAL_MASK (0x19 << 0)
>  #define PHYPWR_OTG_DISABLE (0x1 << 4)
>  #define PHYPWR_ANALOG_POWERDOWN(0x1 << 3)
>  #define PHYPWR_FORCE_SUSPEND   (0x1 << 1)
>  /* For Exynos4 */
This comment is misplaced?

> -#define PHYPWR_NORMAL_MASK_PHY0(0x39 << 0)
> -#define PHYPWR_SLEEP_PHY0  (0x1 << 5)
> +#define PHYPWR_PHY0_NORMAL_MASK(0x39 << 0)
> +#define PHYPWR_PHY0_SLEEP  (0x1 << 5)
>
>  #define SAMSUNG_PHYCLK (0x04)
>
> @@ -43,9 +52,25 @@
>
>  #define SAMSUNG_RSTCON (0x08)
>
> -#define RSTCON_PHYLINK_SWRST   (0x1 << 2)
> -#define RSTCON_HLINK_SWRST (0x1 << 1)
> -#define RSTCON_SWRST   (0x1 << 0)
> +#define RSTCON_HOST_LINK_PORT_SWRST_MASK   (0xf << 6)
> +#define RSTCON_HOST_LINK_PORT2_SWRST   (0x1 << 9)
> +#define RSTCON_HOST_LINK_PORT1_SWRST   (0x1 << 8)
> +#define RSTCON_HOST_LINK_PORT0_SWRST   (0x1 << 7)
> +#define RSTCON_HOST_LINK_ALL_SWRST (0x1 << 6)
> +#define RSTCON_PHY1_SWRST_MASK (0x7 << 3)
> +#define RSTCON_PHY1_HSIC_SWRST (0x1 << 5)
> +#define RSTCON_PHY1_STD_SWRST  (0x1 << 4)
> +#define RSTCON_PHY1_ALL_SWRST  (0x1 << 3)
> +#define RSTCON_PHY0_SWRST_MASK (0x7 << 0)
> +#define RSTCON_PHY0_PHYLINK_SWRST  (0x1 << 2)
> +#define RSTCON_PHY0_HLINK_SWRST(0x1 << 1)
> +#define RSTCON_PHY0_SWRST  (0x1 << 0)
> +
> +/* EXYNOS4 */
> +#define EXYNOS4_PHY1CON(0x34)
> +
> +#define PHY1CON_FPENABLEN  (0x1 << 0)
> +
>
>  /* EXYNOS5 */
>  #define EXYNOS5_PHY_HOST_CTRL0 (0x00)
> diff --git a/drivers/usb/phy/phy-samsung-usb2.c 
> b/drivers/usb/phy/phy-samsung-usb2.c
> index 9d5e273..4f93d84 100644
> --- a/drivers/usb/phy/phy-samsung-usb2.c
> +++ b/drivers/usb/phy/phy-samsung-usb2.c
> @@ -158,6 +158,15 @@ static void samsung_exynos5_usb2phy_enable(struct 
> samsung_usbphy *sphy)
> writel(ohcictrl, regs + EXYNOS5_PHY_HOST_OHCICTRL);
>  }
>
> +static bool exynos4_phyhost_is_on(void __iomem *regs)
> +{
> +   u32 reg;
> +
> +   reg = readl(regs + SAMSUNG_PHYPWR);
> +
> +   return !(reg & PHYPWR_PHY1_STD_ANALOG_POWERDOWN);
> +}
> +
>  static void samsung_usb2phy_enable(struct samsung_usbphy *sphy)
>  {
> void __iomem *regs = sphy->regs;
> @@ -165,6 +174,18 @@ static void samsung_usb2phy_enable(struct samsung_usbphy 
> *sphy)
> u32 phyclk;
> u32 rstcon;
>
> +   switch (sphy->drv_data->cpu_type) {
> +   case TYPE_EXYNOS4210:
> +   atomic_inc(&sphy->phy_usage);
> +
> +   if (exynos4_phyhost_is_on(regs)) {
> +   dev_info(sphy->dev, "Already power on PHY\n");
> +   return;
> +   }
> +   default:
> +   break;
> +   }
> +
> /* set clock frequency for PLL */
> phyclk = sphy->ref_clk_freq;
> phypwr = readl(regs + SAMSUNG_PHYPWR);
> @@ -174,22 +195,48 @@ static void samsung_usb2phy_enable(struct 
> samsung_usbphy *sphy)
> case TYPE_S3C64XX:
> phyclk &= ~PHYCLK_COMMON_ON_N;
>

Re: [patch 2/2] memcg: do not sleep on OOM waitqueue with full charge context

2013-06-05 Thread David Rientjes

On Wed, 5 Jun 2013, Johannes Weiner wrote:

> The memcg OOM handling is incredibly fragile because once a memcg goes
> OOM, one task (kernel or userspace) is responsible for resolving the
> situation.

Not sure what this means.  Are you referring to the case where the memcg 
is disabled from oom killing and we're relying on a userspace handler and 
it may be caught on the waitqueue itself?  Otherwise, are you referring to 
the task as eventually being the only one that takes the hierarchy oom 
lock and calls mem_cgroup_out_of_memory()?  I don't think you can make a 
case for the latter since the one that calls mem_cgroup_out_of_memory() 
should return and call memcg_wakeup_oom().  We obviously don't want to do 
any memory allocations in this path.

> Every other task that gets caught trying to charge memory
> gets stuck in a waitqueue while potentially holding various filesystem
> and mm locks on which the OOM handling task may now deadlock.
> 

What locks?  The oom killer quite extensively needs task_lock() but we 
shouldn't be calling it in the case where we hold this since it's a 
spinlock and mem_cgroup_do_charge() never gets to the point of handling 
the oom.

Again, are you referring only to a userspace handler here?

> Do two things:
> 
> 1. When OOMing in a system call (buffered IO and friends), invoke the
>OOM killer but just return -ENOMEM, never sleep.  Userspace should
>be able to handle this.
> 

The sleep should never occur for any significant duration currently, the 
current implementation ensures one process calls 
mem_cgroup_out_of_memory() while all others sit on a waitqueue until that 
process returns from the oom killer and then they all wakeup again so the 
killed process may exit and all others can retry their allocations now 
that something has been killed.  If that process hasn't exited yet, the 
next process that locks the memcg oom hierarchy will see the killed 
process with TIF_MEMDIE and the oom killer is deferred.  So this sleep is 
very temporary already, I don't see why you're trying to make it faster 
while another thread finds a candidate task, it works quite well already.

> 2. When OOMing in a page fault and somebody else is handling the
>situation, do not sleep directly in the charging code.  Instead,
>remember the OOMing memcg in the task struct and then fully unwind
>the page fault stack first before going to sleep.
> 

Are you trying to address a situation here where the memcg oom handler 
takes too long to work?  We've quite extensively tried to improve that, 
especially by reducing its dependency on tasklist_lock which is contended 
from the writeside in the exit path and by only iterating processes that 
are attached to that memcg hierarchy and not all processes on the system 
like before.  I don't see the issue with scheduling other oom processes 
while one is doing mem_cgroup_out_of_memory().  (I would if the oom killer 
required things like mm->mmap_sem, but obviously it doesn't for reasons 
not related to memcg.)

> While reworking the OOM routine, also remove a needless OOM waitqueue
> wakeup when invoking the killer.  Only uncharges and limit increases,
> things that actually change the memory situation, should do wakeups.
> 

It's not needless at all, it's vitally required!  The oom killed process 
needs to be removed from the waitqueue and scheduled now with TIF_MEMDIE 
that the memcg oom killer provided so the allocation succeeds in the page 
allocator and memcg bypasses the charge so it can exit.

Exactly what problem are you trying to address with this patch?  I don't 
see any description of the user-visible effects or a specific example of 
the scenario you're trying to address here.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Eric Dumazet

On Wed, 2013-06-05 at 20:50 -0700, Ben Greear wrote:
> On 06/05/2013 08:46 PM, Eric Dumazet wrote:
> >
> > We use at Google a patch that triggers a warning if a thread holds the cpu
> > without taking care of need_resched() for more than xx ms
> 
> Well, I'm sure that patch works nicely until the clock stops moving
> forward :)
> 

This is not using jiffies, but the clock used in kernel/sched/core.c,
with ns resolution ;)

> I'll post a patch with limit of 10 shortly.

ok


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 1/2] arch: invoke oom-killer from page fault

2013-06-05 Thread David Rientjes

On Wed, 5 Jun 2013, Johannes Weiner wrote:

> Since '1c0fe6e mm: invoke oom-killer from page fault', page fault
> handlers should not directly kill faulting tasks in an out of memory
> condition.

I have no objection to the patch, but there's no explanation given here 
why exiting with a kill shouldn't be done.  Is it because of memory 
reserves and there is no guarantee that current will be able to exit?  Or 
is it just for consistency with other archs?

> Instead, they should be invoking the OOM killer to pick
> the right task.  Convert the remaining architectures.
> 

If this is a matter of memory reserves, I guess you could point people who 
want the current behavior (avoiding the expensiveness of the tasklist scan 
in the oom killer for example) to /proc/sys/vm/oom_kill_allocating_task?

This changelog is a bit cryptic in its motivation.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the kvm tree with the s390 tree

2013-06-05 Thread Stephen Rothwell

Hi all,

Today's linux-next merge of the kvm tree got a conflict in
arch/s390/include/asm/pgtable.h between commit 338679f7ba4a
("s390/pgtable: Fix guest overindication for change bit") from the s390
tree and commit 0d0dafc1e48f ("s390/kvm: rename RCP_xxx defines to
PGSTE_xxx") from the kvm tree.

I fixed it up (see below) and can carry the fix as necessary (no action
is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

diff --cc arch/s390/include/asm/pgtable.h
index e8b6e5b,1d0ad7d..000
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@@ -632,11 -628,10 +628,11 @@@ static inline void pgste_set_unlock(pte
  {
  #ifdef CONFIG_PGSTE
asm(
-   "   nihh%1,0xff7f\n"/* clear RCP_PCL_BIT */
+   "   nihh%1,0xff7f\n"/* clear PCL bit */
"   stg %1,%0\n"
: "=Q" (ptep[PTRS_PER_PTE])
 -  : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) : "cc");
 +  : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
 +  : "cc", "memory");
preempt_enable();
  #endif
  }
@@@ -712,19 -700,17 +708,19 @@@ static inline void pgste_set_key(pte_t 
  {
  #ifdef CONFIG_PGSTE
unsigned long address;
 -  unsigned long okey, nkey;
 +  unsigned long nkey;
  
 -  if (!pte_present(entry))
 +  if (pte_val(entry) & _PAGE_INVALID)
return;
 +  VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
address = pte_val(entry) & PAGE_MASK;
 -  okey = nkey = page_get_storage_key(address);
 -  nkey &= ~(_PAGE_ACC_BITS | _PAGE_FP_BIT);
 -  /* Set page access key and fetch protection bit from pgste */
 -  nkey |= (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
 -  if (okey != nkey)
 -  page_set_storage_key(address, nkey, 0);
 +  /*
 +   * Set page access key and fetch protection bit from pgste.
 +   * The guest C/R information is still in the PGSTE, set real
 +   * key C/R to 0.
 +   */
-   nkey = (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56;
++  nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
 +  page_set_storage_key(address, nkey, 0);
  #endif
  }
  


pgpDb2feB2nif.pgp
Description: PGP signature

Re: [PATCH 0/5] Broken DM816x support in Linux 3.10-rc4

2013-06-05 Thread Paul Walmsley

здравствуйте,

On Wed, 5 Jun 2013, Aida Mynzhasova wrote:

> Not so long ago I tried to boot the Linux 3.10-rc4 kernel on a DM816x EVM
> board. Unfortunately, my attempts failed because of poor
> support for DM81xx-based devices in new kernels.

Yeah, TI pretty much gave up on trying to get support for that chip 
upstream a few years ago.  Same for DM814x.

> So, I propose this patch series, which fixes that kernel crash and 
> adds new functions/structures, required for early initialization on 
> DM816x (power and clock domains, hwmods). After applying these patches 
> the kernel is able to successfully continue booting till clock 
> initialization (will be added later).

OK, it will be great to get my DM8168EVM booting on mainline.  But there 
are a few preliminary issues with the patches:

1. All new chip and board support needs to be DT-only.  So, no new board 
files.  Also, all the hwmod data shouldn't have IRQ, DMA, etc. data - that 
should come from DT.  See for example the recently posted patch "[PATCH 
13/14] ARM: AM33XX: hwmod data: irq, dma and addr info clean up"

2. Looks like patch 2 is missing from the list.  If it's too big for the 
list, please break it down into smaller patches.

3. Did you write this code and data, hwmods, etc., or did it come from a 
TI kernel?  Please note this clearly in the patch descriptions.

- Paul

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Ben Greear


On 06/05/2013 08:46 PM, Eric Dumazet wrote:

On Wed, 2013-06-05 at 20:41 -0700, Ben Greear wrote:

On 06/05/2013 08:26 PM, Eric Dumazet wrote:

On Wed, 2013-06-05 at 20:14 -0700, Tejun Heo wrote:



Ah, so, that's why it's showing up now.  We probably have had the same
issue all along but it used to be masked by the softirq limiting.  Do
you care to revive the 10 iterations limit so that it's limited by
both the count and timing?  We do wanna find out why softirq is
spinning indefinitely tho.


Yes, no problem, I can do that.


Limiting it to 5000 fixes my problem, so if you wanted it larger than 10, that 
would
be fine by me.

I can send a version of my patch easily enough if we can agree on the max 
number of
loops (and if indeed my version of the patch is acceptable).


Well, 10 was the prior limit and seems really fine.

The non-update of jiffies seems to be a quite exceptional condition (I hope...)

We use at Google a patch that triggers a warning if a thread holds the cpu
without taking care of need_resched() for more than xx ms


Well, I'm sure that patch works nicely until the clock stops moving
forward :)

I'll post a patch with limit of 10 shortly.

Thanks,
Ben



--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/3 v3] dcache: make it more scalable on large system

2013-06-05 Thread Dave Chinner

On Wed, May 29, 2013 at 10:37:00PM +0200, Andi Kleen wrote:
> > As Dave said before, is the last path component sufficient?  Or how
> > about an inode number?
> 
> Neither works, the profiler needs to find the file and read it.
> 
> inode searching would be incredibly expensive, unless the file system
> provided a "open-by-inode" primitive

That's effectively what fs/fhandle.c gives you.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Eric Dumazet

On Wed, 2013-06-05 at 20:41 -0700, Ben Greear wrote:
> On 06/05/2013 08:26 PM, Eric Dumazet wrote:
> > On Wed, 2013-06-05 at 20:14 -0700, Tejun Heo wrote:
> >
> >>
> >> Ah, so, that's why it's showing up now.  We probably have had the same
> >> issue all along but it used to be masked by the softirq limiting.  Do
> >> you care to revive the 10 iterations limit so that it's limited by
> >> both the count and timing?  We do wanna find out why softirq is
> >> spinning indefinitely tho.
> >
> > Yes, no problem, I can do that.
> 
> Limiting it to 5000 fixes my problem, so if you wanted it larger than 10, 
> that would
> be fine by me.
> 
> I can send a version of my patch easily enough if we can agree on the max 
> number of
> loops (and if indeed my version of the patch is acceptable).

Well, 10 was the prior limit and seems really fine.

The non-update of jiffies seems to be a quite exceptional condition (I hope...)

We use at Google a patch that triggers a warning if a thread holds the cpu
without taking care of need_resched() for more than xx ms



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Ben Greear


On 06/05/2013 08:26 PM, Eric Dumazet wrote:

On Wed, 2013-06-05 at 20:14 -0700, Tejun Heo wrote:



Ah, so, that's why it's showing up now.  We probably have had the same
issue all along but it used to be masked by the softirq limiting.  Do
you care to revive the 10 iterations limit so that it's limited by
both the count and timing?  We do wanna find out why softirq is
spinning indefinitely tho.


Yes, no problem, I can do that.


Limiting it to 5000 fixes my problem, so if you wanted it larger than 10, that 
would
be fine by me.

I can send a version of my patch easily enough if we can agree on the max 
number of
loops (and if indeed my version of the patch is acceptable).

Thanks,
Ben


--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH v2] mfd: DT bindings for the palmas family MFD

2013-06-05 Thread J, KEERTHY

Hi Grant,

> -Original Message-
> From: Grant Likely [mailto:glik...@secretlab.ca] On Behalf Of Grant
> Likely
> Sent: Thursday, June 06, 2013 5:32 AM
> To: J, KEERTHY; linux-kernel@vger.kernel.org
> Cc: linux-...@vger.kernel.org; devicetree-disc...@lists.ozlabs.org;
> swar...@wwwdotorg.org; broo...@opensource.wolfsonmicro.com;
> rob.herr...@calxeda.com; r...@landley.net; sa...@linux.intel.com;
> w...@iguana.be; lgirdw...@gmail.com; g...@slimlogic.co.uk; Kristo, Tero;
> lee.jo...@linaro.org; J, KEERTHY; Ian Lartey
> Subject: Re: [PATCH v2] mfd: DT bindings for the palmas family MFD
> 
> On Tue, 4 Jun 2013 14:11:49 +0530, J Keerthy  wrote:
> > From: Graeme Gregory 
> >
> > Add the various binding files for the palmas family of chips. There
> is
> > a top level MFD binding then a separate binding for regulators IP
> blocks on chips.
> >
> > Signed-off-by: Graeme Gregory 
> > Signed-off-by: J Keerthy 
> > Signed-off-by: Ian Lartey 
> 
> Applied, thanks.
> 

There are a couple of minor comments from Stephen. I will post a v3
of this with an Acked-by: Stephen. Could you please pull that
and drop this?
 
> g.
>

Regards,
Keerthy
 
> > ---
> > Changes from v1:
> > * Corrected ti,ldo6_vibrator property ---> ti,ldo6-vibrator
> > * Added the irq.h header file inclusion for defining type of
> interrupt
> >
> >  Documentation/devicetree/bindings/mfd/palmas.txt   |   49
> +++
> >  .../devicetree/bindings/regulator/palmas-pmic.txt  |   65
> 
> >  2 files changed, 114 insertions(+), 0 deletions(-)  create mode
> > 100644 Documentation/devicetree/bindings/mfd/palmas.txt
> >  create mode 100644
> > Documentation/devicetree/bindings/regulator/palmas-pmic.txt
> >
> > diff --git a/Documentation/devicetree/bindings/mfd/palmas.txt
> > b/Documentation/devicetree/bindings/mfd/palmas.txt
> > new file mode 100644
> > index 000..c6c5e78
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/mfd/palmas.txt
> > @@ -0,0 +1,49 @@
> > +* palmas device tree bindings
> > +
> > +The TI palmas family current members :-
> > +twl6035 (palmas)
> > +twl6037 (palmas)
> > +tps65913 (palmas)
> > +tps65914 (palmas)
> > +
> > +Required properties:
> > +- compatible : Should be from the list
> > +  ti,twl6035
> > +  ti,twl6036
> > +  ti,twl6037
> > +  ti,tps65913
> > +  ti,tps65914
> > +  ti,tps80036
> > +and also the generic series names
> > +  ti,palmas
> > +- interrupt-controller : palmas has its own internal IRQs
> > +- #interrupt-cells : should be set to 2 for IRQ number and flags
> > +  The first cell is the IRQ number.
> > +  The second cell is the flags, encoded as the trigger masks from
> > +  Documentation/devicetree/bindings/interrupts.txt
> > +- interrupt-parent : The parent interrupt controller.
> > +
> > +Optional properties:
> > +  ti,mux_padX : set the pad register X (1-2) to the correct muxing
> for the
> > +   hardware, if not set will use muxing in OTP.
> > +
> > +Example:
> > +
> > +palmas {
> > +   compatible = "ti,twl6035", "ti,palmas";
> > +   reg = <0x48>
> > +   interrupt-parent = <>;
> > +   interrupt-controller;
> > +   #interrupt-cells = <2>;
> > +
> > +   ti,mux-pad1 = <0>;
> > +   ti,mux-pad2 = <0>;
> > +
> > +   #address-cells = <1>;
> > +   #size-cells = <0>;
> > +
> > +   pmic {
> > +   compatible = "ti,twl6035-pmic", "ti,palmas-pmic";
> > +   
> > +   };
> > +}
> > diff --git
> > a/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
> > b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
> > new file mode 100644
> > index 000..a0ccdf2
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
> > @@ -0,0 +1,65 @@
> > +* palmas regulator IP block devicetree bindings
> > +
> > +Required properties:
> > +- compatible : Should be from the list
> > +  ti,twl6035-pmic
> > +  ti,twl6036-pmic
> > +  ti,twl6037-pmic
> > +  ti,tps65913-pmic
> > +  ti,tps65914-pmic
> > +and also the generic series names
> > +  ti,palmas-pmic
> > +
> > +Optional properties:
> > +- ti,ldo6-vibrator : ldo6 is in vibrator mode
> > +
> > +Optional nodes:
> > +- regulators : should contain the constrains and init information
> for the
> > +  regulators. It should contain a subnode per regulator from
> the
> > +  list.
> > +  For ti,palmas-pmic - smps12, smps123, smps3 depending on
> OTP,
> > +  smps45, smps457, smps7 depending on varient, smps6,
> smps[8-10],
> > +  ldo[1-9], ldoln, ldousb
> > +
> > +  optional chip specific regulator fields :-
> > +  ti,warm-reset - maintain voltage during warm
> reset(boolean)
> > +  ti,roof-floor - control voltage selection by pin(boolean)
> > +  ti,sleep-mode - mode to adopt in pmic sleep 0 - off, 1 -
> auto,
> > +  2 - eco, 3 - forced pwm
> > +  ti,tstep - slope control 0 - Jump, 1 10mV/us, 2 5mV/us, 3
> 2.5mV/us
> > +  ti,smps-range - OTP has the wrong range set for the
> hardware so

RE: [PATCH v2] mfd: DT bindings for the palmas family MFD

2013-06-05 Thread J, KEERTHY

Hi Stephen,

Thanks for the quick review.

> -Original Message-
> From: Stephen Warren [mailto:swar...@wwwdotorg.org]
> Sent: Wednesday, June 05, 2013 10:44 PM
> To: J, KEERTHY
> Cc: linux-kernel@vger.kernel.org; linux-...@vger.kernel.org;
> devicetree-disc...@lists.ozlabs.org;
> broo...@opensource.wolfsonmicro.com; rob.herr...@calxeda.com;
> r...@landley.net; sa...@linux.intel.com; w...@iguana.be;
> lgirdw...@gmail.com; g...@slimlogic.co.uk; Kristo, Tero;
> lee.jo...@linaro.org; Ian Lartey
> Subject: Re: [PATCH v2] mfd: DT bindings for the palmas family MFD
> 
> On 06/04/2013 02:41 AM, J Keerthy wrote:
> > From: Graeme Gregory 
> >
> > Add the various binding files for the palmas family of chips. There
> is
> > a top level MFD binding then a separate binding for regulators IP
> blocks on chips.
> 
> > diff --git a/Documentation/devicetree/bindings/mfd/palmas.txt
> > b/Documentation/devicetree/bindings/mfd/palmas.txt
> 
> > +Optional properties:
> > +  ti,mux_padX : set the pad register X (1-2) to the correct muxing
> for the
> > +   hardware, if not set will use muxing in OTP.
> > +
> > +Example:
> ...
> > +   ti,mux-pad1 = <0>;
> > +   ti,mux-pad2 = <0>;
> 
> Use of - vs. _ is inconsistent there. It should be -.
> 

Oops. I will fix this.

> > diff --git
> > a/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
> > b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
> 
> > +Optional nodes:
> > +- regulators : should contain the constrains and init information
> for the
> > +  regulators. It should contain a subnode per regulator from
> the
> > +  list.
> 
> I would re-phrase that as:
> 
> Must contain a sub-node per regulator from the list below. Each sub-
> node should contain the constraints and initialization information for
> that regulator. See regulator.txt for a description of standard
> properties for these sub-nodes. Additional custom properties  are
> listed below.
> 
> > +  For ti,palmas-pmic - smps12, smps123, smps3 depending on
> OTP,
> > +  smps45, smps457, smps7 depending on varient, smps6,
> > +smps[8-10],
> 
> typo: s/varient/variant/.

I will fix this.

> 
> > +  ldo[1-9], ldoln, ldousb
> 
> nit: s/$/./ ?
> 

Ok.

> > +
> > +  optional chip specific regulator fields :-
> 
> Perhaps "Optional sub-node properties:"?

Ok.

> 
> > +pmic {
> > +   compatible = "ti,twl6035-pmic", "ti,palmas-pmic";
> > +   interrupt-parent = <>;
> > +   interrupts = <14 IRQ_TYPE_NONE>;
> > +   interrupt-name = "short-irq";
> 
> If those are required, shouldn't they be listed in a "Required
> properties" section above? In particular, the order of entries in the
> interrupts property must be defined, as well as the expected names in
> the interrupt-name property.
> 
> Oh, and it's interrupt-names not interrupt-name.

Ok.

> 
> Oh, one question though: How does the regulator driver determine the
> register address of the regulator sub-device within the overall PMIC?
> Presumably if these are pluggable independent modules, that could
> change depending on which overall chip the PMIC device is plugged into.
> don't you need a reg property to specify that?

The variants have identical register addresses. These are not pluggable
independent modules. All the variants come with all the regulators
listed above in general. The driver today has a statically defined
array of all the above-mentioned regulators with their addresses.
 
drivers/regulator/palmas-regulator.c

Line 38.

> 
> Aside from those comments, this all looks reasonable to me.

Once again thanks for the comprehensive feedback.

Regards,
Keerthy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH V2] vhost_net: clear msg.control for non-zerocopy case during tx

2013-06-05 Thread Jason Wang

When we decide not to use zero-copy, msg.control should be set to NULL;
otherwise macvtap/tap may set zerocopy callbacks, which may wrongly decrease
the kref of ubufs.

The bug was introduced by commit cedb9bdce099206290a2bdd02ce47a7b253b6a84
(vhost-net: skip head management if no outstanding).

This solves the following warnings:

WARNING: at include/linux/kref.h:47 handle_tx+0x477/0x4b0 [vhost_net]()
Modules linked in: vhost_net macvtap macvlan tun nfsd exportfs bridge stp llc 
openvswitch kvm_amd kvm bnx2 megaraid_sas [last unloaded: tun]
CPU: 5 PID: 8670 Comm: vhost-8668 Not tainted 3.10.0-rc2+ #1566
Hardware name: Dell Inc. PowerEdge R715/00XHKG, BIOS 1.5.2 04/19/2011
a0198323 88007c9ebd08 81796b73 88007c9ebd48
8103d66b 7b773e20 8800779f 8800779f43f0
8800779f8418 015c 0062 88007c9ebd58
Call Trace:
[] dump_stack+0x19/0x1e
[] warn_slowpath_common+0x6b/0xa0
[] warn_slowpath_null+0x15/0x20
[] handle_tx+0x477/0x4b0 [vhost_net]
[] handle_tx_kick+0x10/0x20 [vhost_net]
[] vhost_worker+0xfe/0x1a0 [vhost_net]
[] ? vhost_attach_cgroups_work+0x30/0x30 [vhost_net]
[] ? vhost_attach_cgroups_work+0x30/0x30 [vhost_net]
[] kthread+0xc6/0xd0
[] ? kthread_freezable_should_stop+0x70/0x70
[] ret_from_fork+0x7c/0xb0
[] ? kthread_freezable_should_stop+0x70/0x70

Acked-by: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
---
The patch is needed for -stable.

Changes from v1:
- code style issue fix
---
 drivers/vhost/net.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2b51e23..518622d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -436,6 +436,8 @@ static void handle_tx(struct vhost_net *net)
kref_get(&ubufs->kref);
}
nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
+   } else {
+   msg.msg_control = NULL;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

2013-06-05 Thread Stephen Rothwell

Hi Anton,

On Thu, 6 Jun 2013 13:01:05 +1000 Anton Blanchard  wrote:
>
> > This is causing a regression on 64bit powerpc with 32bit usermode.
> > When I hit userspace, udev is broken and I suspect all networking is
> > broken as well.
> > 
> > Can we please revert 1be374a0518a288147c6a7398792583200a67261
> > upstream?
> > 
> > Found via bisect.
> 
> Doesn't this patch break compat_sys_sendmsg and compat_sys_recvmsg?
> We'd need to move the guts of sys_* into compat_sys_* to fix it.

What you really need is a set of common functions that the sys_ and
compat_sys_ functions can call - with the sys_ functions forbidding
MSG_CMSG_COMPAT and the compat_sys_ functions setting it.

-- 
Cheers,
Stephen Rothwell                    s...@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/


pgptCj4qRdksw.pgp
Description: PGP signature

Re: [PATCH] vhost_net: clear msg.control for non-zerocopy case during tx

2013-06-05 Thread Jason Wang

On 06/05/2013 09:44 PM, Sergei Shtylyov wrote:
> Hello.
>
> On 05-06-2013 11:40, Jason Wang wrote:
>
>> When we decide not use zero-copy, msg.control should be set to NULL
>> otherwise
>> macvtap/tap may set zerocopy callbacks which may decrease the kref of
>> ubufs
>> wrongly.
>
>> Bug were introduced by commit cedb9bdce099206290a2bdd02ce47a7b253b6a84
>> (vhost-net: skip head management if no outstanding).
>
>> This solves the following warnings:
>
>> WARNING: at include/linux/kref.h:47 handle_tx+0x477/0x4b0 [vhost_net]()
>> Modules linked in: vhost_net macvtap macvlan tun nfsd exportfs bridge
>> stp llc openvswitch kvm_amd kvm bnx2 megaraid_sas [last unloaded: tun]
>> CPU: 5 PID: 8670 Comm: vhost-8668 Not tainted 3.10.0-rc2+ #1566
>> Hardware name: Dell Inc. PowerEdge R715/00XHKG, BIOS 1.5.2 04/19/2011
>> a0198323 88007c9ebd08 81796b73 88007c9ebd48
>> 8103d66b 7b773e20 8800779f 8800779f43f0
>> 8800779f8418 015c 0062 88007c9ebd58
>> Call Trace:
>> [] dump_stack+0x19/0x1e
>> [] warn_slowpath_common+0x6b/0xa0
>> [] warn_slowpath_null+0x15/0x20
>> [] handle_tx+0x477/0x4b0 [vhost_net]
>> [] handle_tx_kick+0x10/0x20 [vhost_net]
>> [] vhost_worker+0xfe/0x1a0 [vhost_net]
>> [] ? vhost_attach_cgroups_work+0x30/0x30 [vhost_net]
>> [] ? vhost_attach_cgroups_work+0x30/0x30 [vhost_net]
>> [] kthread+0xc6/0xd0
>> [] ? kthread_freezable_should_stop+0x70/0x70
>> [] ret_from_fork+0x7c/0xb0
>> [] ? kthread_freezable_should_stop+0x70/0x70
>
>> Signed-off-by: Jason Wang 
>> ---
>>   drivers/vhost/net.c |3 ++-
>>   1 files changed, 2 insertions(+), 1 deletions(-)
>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 2b51e23..b07d96b 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -436,7 +436,8 @@ static void handle_tx(struct vhost_net *net)
>>   kref_get(>kref);
>>   }
>>   nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
>> -}
>> +} else
>
>You have to use {} on the *else* branch if you have it on the *if*
> branch (and vice versa), according to Documentation/CodingStyle.

checkpatch.pl didn't complain about this; will send v2.

Thanks
>
>> +msg.msg_control = NULL;
>
> WBR, Sergei
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Eric Dumazet

On Wed, 2013-06-05 at 20:14 -0700, Tejun Heo wrote:

> 
> Ah, so, that's why it's showing up now.  We probably have had the same
> issue all along but it used to be masked by the softirq limiting.  Do
> you care to revive the 10 iterations limit so that it's limited by
> both the count and timing?  We do wanna find out why softirq is
> spinning indefinitely tho.

Yes, no problem, I can do that.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [net-next rfc V3 8/9] macvtap: add TUNSETQUEUE ioctl

2013-06-05 Thread Jason Wang

On 06/05/2013 06:59 PM, Michael S. Tsirkin wrote:
> On Wed, Jun 05, 2013 at 02:36:31PM +0800, Jason Wang wrote:
>> This patch adds a TUNSETQUEUE ioctl to let userspace temporarily disable or
>> enable a queue of macvtap. This keeps macvtap compatible with tuntap at the
>> API layer, which makes it simpler for userspace to manage the queues. This is
>> done by introducing a linked list to track all taps while using the
>> vlan->taps array to track only active taps.
>>
>> Signed-off-by: Jason Wang 
>> ---
>>  drivers/net/macvtap.c  |  133 
>> +++-
>>  include/linux/if_macvlan.h |4 +
>>  2 files changed, 122 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
>> index 14764cc..355e6ad 100644
>> --- a/drivers/net/macvtap.c
>> +++ b/drivers/net/macvtap.c
>> @@ -45,6 +45,8 @@ struct macvtap_queue {
>>  struct file *file;
>>  unsigned int flags;
>>  u16 queue_index;
>> +bool enabled;
>> +struct list_head next;
>>  };
>>  
>>  static struct proto macvtap_proto = {
>> @@ -85,14 +87,37 @@ static const struct proto_ops macvtap_socket_ops;
>>   */
>>  static DEFINE_SPINLOCK(macvtap_lock);
>>  
>> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
>> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
>>  struct macvtap_queue *q)
>>  {
>>  struct macvlan_dev *vlan = netdev_priv(dev);
>> +int err = -EINVAL;
>> +
>> +spin_lock(_lock);
>> +
>> +if (q->enabled)
>> +goto out;
>> +
>> +err = 0;
>> +rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
>> +q->queue_index = vlan->numvtaps;
>> +q->enabled = true;
>> +
>> +vlan->numvtaps++;
>> +vlan->numdisabled--;
>> +out:
>> +spin_unlock(_lock);
>> +return err;
>> +}
>> +
>> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
>> + struct macvtap_queue *q)
>> +{
>> +struct macvlan_dev *vlan = netdev_priv(dev);
>>  int err = -EBUSY;
>>  
>>  spin_lock(_lock);
>> -if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
>> +if (vlan->numvtaps + vlan->numdisabled == MAX_MACVTAP_QUEUES)
>>  goto out;
>>  
>>  err = 0;
>> @@ -102,7 +127,9 @@ static int macvtap_set_queue(struct net_device *dev, 
>> struct file *file,
>>  
>>  q->file = file;
>>  q->queue_index = vlan->numvtaps;
>> +q->enabled = true;
>>  file->private_data = q;
>> +list_add_tail(>next, >tap_link);
>>  
>>  vlan->numvtaps++;
>>  
>> @@ -111,6 +138,38 @@ out:
>>  return err;
>>  }
>>  
>> +static int macvtap_disable_queue(struct macvtap_queue *q)
>> +{
>> +struct macvlan_dev *vlan;
>> +struct macvtap_queue *nq;
>> +int err = 0;
>> +
>> +spin_lock(_lock);
>> +vlan = rcu_dereference_protected(q->vlan,
>> + lockdep_is_held(_lock));
>> +
>> +if (!q->enabled) {
>> +err = -EINVAL;
>> +goto out;
>> +}
>> +
>> +if (vlan) {
>> +int index = q->queue_index;
>> +nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
>> +   lockdep_is_held(_lock));
>> +nq->queue_index = index;
>> +
>> +rcu_assign_pointer(vlan->taps[index], nq);
>> +RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
>> +q->enabled = false;
>> +
>> +vlan->numvtaps--;
>> +vlan->numdisabled++;
>> +}
>> +out:
>> +spin_unlock(_lock);
>> +return err;
>> +}
>>  /*
>>   * The file owning the queue got closed, give up both
>>   * the reference that the files holds as well as the
>> @@ -128,18 +187,24 @@ static void macvtap_put_queue(struct macvtap_queue *q)
>>  vlan = rcu_dereference_protected(q->vlan,
>>   lockdep_is_held(_lock));
>>  if (vlan) {
>> +int numvtaps = vlan->numvtaps;
>>  int index = q->queue_index;
>> -BUG_ON(index >= vlan->numvtaps);
>>  
>> -nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
>> -   lockdep_is_held(_lock));
>> -rcu_assign_pointer(vlan->taps[index], nq);
>> -nq->queue_index = index;
>> +if (q->enabled) {
>> +BUG_ON(index >= vlan->numvtaps);
>> +nq = rcu_dereference_protected(vlan->taps[numvtaps - 1],
>> +lockdep_is_held(_lock));
>> +rcu_assign_pointer(vlan->taps[index], nq);
> Do we really need these tricks?
> Can't we call macvtap_disable_queue and then only handle disabled queues
> here?

We could, will do it.
>
>> +nq->queue_index = index;
>> +
>> +RCU_INIT_POINTER(vlan->taps[numvtaps - 1], NULL);
>> +vlan->numvtaps--;
>> +

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Tejun Heo

Hello, Eric.

On Wed, Jun 05, 2013 at 06:34:52PM -0700, Eric Dumazet wrote:
> > Ingo, Thomas, we're seeing a stop_machine hanging because
> > 
> > * All other CPUs entered IRQ disabled stage.  Jiffies is not being
> >   updated.
> > 
> > * The last CPU get caught up executing softirq indefinitely.  As
> >   jiffies doesn't get updated, it never breaks out of softirq
> >   handling.  This is a deadlock.  This CPU won't break out of softirq
> >   handling unless jiffies is updated and other CPUs can't do anything
> >   until this CPU enters the same stop_machine stage.
> > 
> > Ben found out that breaking out of softirq handling after certain
> > number of repetitions makes the issue go away, which isn't a proper
> > fix but we might want anyway.  What do you guys think?
> > 
> 
> Interesting
> 
> Before 3.9 and commit c10d73671ad30f5469
> ("softirq: reduce latencies") we used to limit the __do_softirq() loop
> to 10.

Ah, so, that's why it's showing up now.  We probably have had the same
issue all along but it used to be masked by the softirq limiting.  Do
you care to revive the 10 iterations limit so that it's limited by
both the count and timing?  We do wanna find out why softirq is
spinning indefinitely tho.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [net-next rfc V3 7/9] macvtap: allow TUNSETIFF to create multiqueue device

2013-06-05 Thread Jason Wang

On 06/05/2013 06:43 PM, Michael S. Tsirkin wrote:
> On Wed, Jun 05, 2013 at 02:36:30PM +0800, Jason Wang wrote:
>> Though the queues are in fact created by open(), we still need to add this
>> check to be compatible with tuntap, which lets mgmt software use a single API
>> to manage queues. This patch only validates the device name and moves the
>> TUNSETIFF handling to a helper.
>>
>> Signed-off-by: Jason Wang 
> The patch is OK, the description is confusing.
> What you mean is simply:
>
>   Allow IFF_MULTI_QUEUE in TUNSETIFF for macvtap, to match
>   tun behaviour.
>
> And if you put it like this, I would say make this
> the last patch in the series, so userspace
> can use IFF_MULTI_QUEUE to detect new versus old
> behaviour.

Makes sense, thanks.
>
>> ---
>>  drivers/net/macvtap.c |   51 
>> ++--
>>  1 files changed, 40 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
>> index 5ccba99..14764cc 100644
>> --- a/drivers/net/macvtap.c
>> +++ b/drivers/net/macvtap.c
>> @@ -869,6 +869,7 @@ out:
>>  return ret;
>>  }
>>  
>> +
>>  static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
>>  {
>>  struct macvlan_dev *vlan;
> Please don't.

Ok.
>
>> @@ -887,6 +888,44 @@ static void macvtap_put_vlan(struct macvlan_dev *vlan)
>>  dev_put(vlan->dev);
>>  }
>>  
>> +static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
>> +{
>> +struct macvtap_queue *q = file->private_data;
>> +struct net *net = current->nsproxy->net_ns;
>> +struct inode *inode = file_inode(file);
>> +struct net_device *dev, *dev2;
>> +struct ifreq ifr;
>> +
>> +if (copy_from_user(, ifr_u, sizeof(struct ifreq)))
>> +return -EFAULT;
>> +
>> +/* To keep the same behavior of tuntap, validate ifr_name */
> So I'm not sure - why is it important to validate ifr_name here?
> We ignore the name for all other flags - why is IFF_MULTI_QUEUE
> special?

It raises another question: why not validate ifname like tuntap? We
should warn userspace about their error, otherwise they may create
queues on the wrong device. In fact I want to validate for both, but keep
the behaviour w/o IFF_MULTI_QUEUE for backward compatibility.
>
>> +if (ifr.ifr_flags & IFF_MULTI_QUEUE) {
>> +dev = __dev_get_by_name(net, ifr.ifr_name);
>> +if (!dev)
>> +return -EINVAL;
>> +
>> +dev2 = dev_get_by_macvtap_minor(iminor(inode));
>> +if (!dev2)
>> +return -EINVAL;
>> +
>> +if (dev != dev2) {
>> +dev_put(dev2);
>> +return -EINVAL;
>> +}
>> +
>> +dev_put(dev2);
>> +}
>> +
>> +if ((ifr.ifr_flags & ~(IFF_VNET_HDR | IFF_MULTI_QUEUE)) !=
>> +(IFF_NO_PI | IFF_TAP))
>> +return -EINVAL;
>> +else
>> +q->flags = ifr.ifr_flags;
>> +
>> +return 0;
>> +}
>> +
>>  /*
>>   * provide compatibility with generic tun/tap interface
>>   */
>> @@ -905,17 +944,7 @@ static long macvtap_ioctl(struct file *file, unsigned 
>> int cmd,
>>  
>>  switch (cmd) {
>>  case TUNSETIFF:
>> -/* ignore the name, just look at flags */
> This is actually a useful comment that you've removed.

Will get it back.
>
>> -if (get_user(u, >ifr_flags))
>> -return -EFAULT;
>> -
>> -ret = 0;
>> -if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
>> -ret = -EINVAL;
>> -else
>> -q->flags = u;
>> -
>> -return ret;
>> +return macvtap_set_iff(file, ifr);
>>  
>>  case TUNGETIFF:
>>  vlan = macvtap_get_vlan(q);
>> -- 
>> 1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 1/2] arch: invoke oom-killer from page fault

2013-06-05 Thread Johannes Weiner

Since '1c0fe6e mm: invoke oom-killer from page fault', page fault
handlers should not directly kill faulting tasks in an out of memory
condition.  Instead, they should be invoking the OOM killer to pick
the right task.  Convert the remaining architectures.

Signed-off-by: Johannes Weiner 
---
 arch/arc/mm/fault.c  | 6 --
 arch/metag/mm/fault.c| 6 --
 arch/mn10300/mm/fault.c  | 7 ---
 arch/openrisc/mm/fault.c | 8 
 arch/score/mm/fault.c| 8 
 arch/tile/mm/fault.c | 8 
 6 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index c0decc1..d5ec60a 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -207,8 +207,10 @@ out_of_memory:
}
up_read(>mmap_sem);
 
-   if (user_mode(regs))
-   do_group_exit(SIGKILL); /* This will never return */
+   if (user_mode(regs)) {
+   pagefault_out_of_memory();
+   return;
+   }
 
goto no_context;
 
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2c75bf7..8fddf46 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -224,8 +224,10 @@ do_sigbus:
 */
 out_of_memory:
up_read(>mmap_sem);
-   if (user_mode(regs))
-   do_group_exit(SIGKILL);
+   if (user_mode(regs)) {
+   pagefault_out_of_memory();
+   return 1;
+   }
 
 no_context:
/* Are we prepared to handle this kernel fault?  */
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index d48a84f..8a2e6de 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -345,9 +345,10 @@ no_context:
  */
 out_of_memory:
up_read(>mmap_sem);
-   printk(KERN_ALERT "VM: killing process %s\n", tsk->comm);
-   if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
-   do_exit(SIGKILL);
+   if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) {
+   pagefault_out_of_memory();
+   return;
+   }
goto no_context;
 
 do_sigbus:
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index e2bfafc..4a41f84 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -267,10 +267,10 @@ out_of_memory:
__asm__ __volatile__("l.nop 1");
 
up_read(>mmap_sem);
-   printk("VM: killing process %s\n", tsk->comm);
-   if (user_mode(regs))
-   do_exit(SIGKILL);
-   goto no_context;
+   if (!user_mode(regs))
+   goto no_context;
+   pagefault_out_of_memory();
+   return;
 
 do_sigbus:
up_read(>mmap_sem);
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 47b600e..6b18fb0 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -172,10 +172,10 @@ out_of_memory:
down_read(>mmap_sem);
goto survive;
}
-   printk("VM: killing process %s\n", tsk->comm);
-   if (user_mode(regs))
-   do_group_exit(SIGKILL);
-   goto no_context;
+   if (!user_mode(regs))
+   goto no_context;
+   pagefault_out_of_memory();
+   return;
 
 do_sigbus:
up_read(>mmap_sem);
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 3d2b81c..f7f99f9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -573,10 +573,10 @@ out_of_memory:
down_read(>mmap_sem);
goto survive;
}
-   pr_alert("VM: killing process %s\n", tsk->comm);
-   if (!is_kernel_mode)
-   do_group_exit(SIGKILL);
-   goto no_context;
+   if (is_kernel_mode)
+   goto no_context;
+   pagefault_out_of_memory();
+   return 0;
 
 do_sigbus:
up_read(>mmap_sem);
-- 
1.8.2.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 2/2] memcg: do not sleep on OOM waitqueue with full charge context

2013-06-05 Thread Johannes Weiner

The memcg OOM handling is incredibly fragile because once a memcg goes
OOM, one task (kernel or userspace) is responsible for resolving the
situation.  Every other task that gets caught trying to charge memory
gets stuck in a waitqueue while potentially holding various filesystem
and mm locks on which the OOM handling task may now deadlock.

Do two things:

1. When OOMing in a system call (buffered IO and friends), invoke the
   OOM killer but just return -ENOMEM, never sleep.  Userspace should
   be able to handle this.

2. When OOMing in a page fault and somebody else is handling the
   situation, do not sleep directly in the charging code.  Instead,
   remember the OOMing memcg in the task struct and then fully unwind
   the page fault stack first before going to sleep.

While reworking the OOM routine, also remove a needless OOM waitqueue
wakeup when invoking the killer.  Only uncharges and limit increases,
things that actually change the memory situation, should do wakeups.

Signed-off-by: Johannes Weiner 
Reviewed-by: Michal Hocko 
---
 include/linux/memcontrol.h |  22 +++
 include/linux/mm.h |   1 +
 include/linux/sched.h  |   6 ++
 mm/ksm.c   |   2 +-
 mm/memcontrol.c| 146 -
 mm/memory.c|  40 +
 mm/oom_kill.c  |   7 ++-
 7 files changed, 154 insertions(+), 70 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c8b1412..8e0f900 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,6 +124,15 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec 
*lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
+static inline void mem_cgroup_set_userfault(struct task_struct *p)
+{
+   p->memcg_oom.in_userfault = 1;
+}
+static inline void mem_cgroup_clear_userfault(struct task_struct *p)
+{
+   p->memcg_oom.in_userfault = 0;
+}
+bool mem_cgroup_oom_synchronize(void);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage);
 
@@ -343,6 +352,19 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct 
task_struct *p)
 {
 }
 
+static inline void mem_cgroup_set_userfault(struct task_struct *p)
+{
+}
+
+static inline void mem_cgroup_clear_userfault(struct task_struct *p)
+{
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+   return false;
+}
+
 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
bool *locked, unsigned long *flags)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b87681a..79ee304 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -176,6 +176,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_RETRY_NOWAIT0x10/* Don't drop mmap_sem and wait 
when retrying */
 #define FAULT_FLAG_KILLABLE0x20/* The fault task is in SIGKILL 
killable region */
 #define FAULT_FLAG_TRIED   0x40/* second try */
+#define FAULT_FLAG_KERNEL  0x80/* kernel-triggered fault 
(get_user_pages etc.) */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 08090e6..0659277 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1403,6 +1403,12 @@ struct task_struct {
unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
} memcg_batch;
unsigned int memcg_kmem_skip_account;
+   struct memcg_oom_info {
+   unsigned int in_userfault:1;
+   unsigned int in_memcg_oom:1;
+   int wakeups;
+   struct mem_cgroup *wait_on_memcg;
+   } memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
struct uprobe_task *utask;
diff --git a/mm/ksm.c b/mm/ksm.c
index b6afe0c..9dff93b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -372,7 +372,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned 
long addr)
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
-   FAULT_FLAG_WRITE);
+   FAULT_FLAG_KERNEL | FAULT_FLAG_WRITE);
else
ret = VM_FAULT_WRITE;
put_page(page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d169a8d..61d3449 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -298,6 +298,7 @@ struct mem_cgroup {
 
booloom_lock;
atomic_tunder_oom;
+   atomic_toom_wakeups;
 
atomic_trefcnt;
 
@@ -2305,6 +2306,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void

Re: [PATCH] dma: mmp_pdma: support for getting residual bytes

2013-06-05 Thread Xiang Wang

2013/6/3 Andy Shevchenko :
> On Mon, Jun 3, 2013 at 6:22 AM, Xiang Wang  wrote:
>> 2013/5/31 Andy Shevchenko :
>>> On Fri, May 31, 2013 at 11:21 AM, Xiang Wang  wrote:
 In some of our drivers (e.g. UART) we may stop a running DMA
 before it finishes. So we need to know how many bytes have
 been transferred.
>>>
>>> Couple of comments below.
>>>
 --- a/drivers/dma/mmp_pdma.c
 +++ b/drivers/dma/mmp_pdma.c
>>>
 @@ -589,7 +638,13 @@ static int mmp_pdma_control(struct dma_chan *dchan, 
 enum dma_ctrl_cmd cmd,
 mmp_pdma_free_desc_list(chan, &chan->chain_pending);
 mmp_pdma_free_desc_list(chan, &chan->chain_running);
 spin_unlock_irqrestore(&chan->desc_lock, flags);
 -   chan->idle = true;
 +   chan->status = DMA_SUCCESS;
 +   chan->bytes_residue = 0;
 +   break;
 +   case DMA_PAUSE:
 +   disable_chan(chan->phy);
 +   chan->status = DMA_PAUSED;
 +   chan->bytes_residue = mmp_pdma_get_bytes_residue(chan);
>>>
>>> Does it mean user has to do DMA_PAUSE first to get more or less
>>> accurate residue?
>>> Logically that sound correct, but in general we may allow user to get
>>> approximate residue value of on going transfer.
>
>> Your comment makes sense. But if the user is allowed to query the
>> residue value in real-time, we cannot just return a saved value to
>> him.
>
> Right.
>
>> Why I use a saved value (chan->bytes_residue)?
>> In current mmp pdma driver, a phy channel will be freed after the
>> transmission finishes (chan->phy is set to NULL). So we cannot get the
>> physical channel information after we call DMA_TERMINATE_ALL or DMA
>> finishes itself.
>
> I don't see any contradiction to workflow.
> So, If you call device_tx_status() when transfer is completed or
> aborted you will get 0 as a residue, which is correct.
>
> >> That is to say, when the user queries the channel information at these
>> points, the chan->phy is usually NULL.
>
 @@ -637,7 +692,8 @@ static enum dma_status mmp_pdma_tx_status(struct 
 dma_chan *dchan,
 unsigned long flags;

 spin_lock_irqsave(&chan->desc_lock, flags);
 -   ret = dma_cookie_status(dchan, cookie, txstate);
 +   ret = chan->status;
 +   dma_set_residue(txstate, chan->bytes_residue);
 spin_unlock_irqrestore(&chan->desc_lock, flags);
>>>
>>> Besides my patch which removes this spinlock I think the workflow
>>> should be something like
>>>
>>> status = dma_cookie_status()
>>> if status == DMA_SUCCESS or !txstate:
>>> return status
>>>
>>> dma_set_residue()
>>> return status
>>>
>>> Because there is no reason to return residue of successfully finished
>>> transfer. It should be 0.
>
>> There is one exception from my point of view. When we are receiving
>> data from peripheral devices, we usually start a DMA transaction with
>> a target length of 4K for example. When a timed-out event occurs in
>> peripheral device, it will notify DMA controller and DMA controller
>> will send out a End of Receive interrupt (Marvell specific?).
>
> Might be your hardware specifics, in our case we have got a timeout
> interrupt from uart controller.
>
>> In such situation, DMA status is also DMA_SUCCESS. But the residual
>> bytes is not 0 and the user must query it.
>
> Which sounds wrong approach.
>
> P.S. take a look at  drivers/tty/serial/8250/8250_dma.c
Hi, Andy
When peripheral device (e.g. UART) is using DMA controller, we should
have 2 solutions to deal with trailing bytes:
1. Timeout interrupt from UART controller.
In this case, we usually pause DMA and read out data from DMA buffer
in uart irq handler.
2. DMA controller handles trailing bytes for us.
This is the case I mentioned in my previous email. "When a timed-out
event occurs in peripheral device, it will notify DMA controller and
DMA controller will send out a End of Receive interrupt"
I think we should know how many residual bytes remain in this case even if the
DMA status is DMA_SUCCESS.

Thanks!
>
> --
> With Best Regards,
> Andy Shevchenko



--
Regards,
Xiang
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [net-next rfc V3 0/9] Multiqueue API for macvtap

2013-06-05 Thread Jason Wang

On 06/05/2013 06:36 PM, Michael S. Tsirkin wrote:
> On Wed, Jun 05, 2013 at 02:36:23PM +0800, Jason Wang wrote:
>> > Hi all:
>> > 
>> > This series implements a v3 of fully tuntap compatible API which could be 
>> > used
>> > by userspace to manage multiple macvtap queues. The main parts is to add
>> > TUNSETQUEUE ioctl support for macvtap.
>> > 
>> > Patch 1 - 5 was some tuntap compatibility and misc cleanups.
>> > Patch 6 removes the linear search in macvtap by reshuffling the macvtaps 
>> > array
>> > each time a queue is removed. After this, we could store both enabled and
>> > disabled queues in the same array without introducing extra data structure.
>> > Patch 7 let TUNSETIFF can create multiqueue device, nothing but some check
>> > were added.
>> > Patch 8 implement TUNSETQUEUE ioctl
>> > Patch 9 reports IFF_MULTI_QUEUE to userspace to notify the userspace that 
>> > the
>> > multiqueue API is completed.
>> > 
>> > Flow caches implementation was missed in this version, since I am doing
>> > rework on the tuntap flow caches. Have done some stress test with both 
>> > netperf
>> > and pktgen.
>> > 
>> > Please review, thanks.
> FYI by netdev rules RFC means "don't apply yet".
>

Ok, let me remove the rfc in next series.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch] memcg: clean up memcg->nodeinfo

2013-06-05 Thread Johannes Weiner

Remove struct mem_cgroup_lru_info and fold its single member, the
variably sized nodeinfo[0], directly into struct mem_cgroup.  This
should make it more obvious why it has to be the last member there.

Also move the comment that's above that special last member below it,
so it is more visible to somebody that considers appending to the
struct mem_cgroup.

Signed-off-by: Johannes Weiner 
---
 mm/memcontrol.c | 21 ++---
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ff7b40d..d169a8d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-struct mem_cgroup_lru_info {
-   struct mem_cgroup_per_node *nodeinfo[0];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -384,14 +380,9 @@ struct mem_cgroup {
 #endif
/* when kmem shrinkers can sleep but can't proceed due to context */
struct work_struct kmemcg_shrink_work;
-   /*
-* Per cgroup active and inactive list, similar to the
-* per zone LRU lists.
-*
-* WARNING: This has to be the last element of the struct. Don't
-* add new fields after this point.
-*/
-   struct mem_cgroup_lru_info info;
+
+   struct mem_cgroup_per_node *nodeinfo[0];
+   /* WARNING: nodeinfo has to be the last member here */
 };
 
 static size_t memcg_size(void)
@@ -777,7 +768,7 @@ static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
VM_BUG_ON((unsigned)nid >= nr_node_ids);
-   return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+   return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -6595,13 +6586,13 @@ static int alloc_mem_cgroup_per_zone_info(struct 
mem_cgroup *memcg, int node)
mz->on_tree = false;
mz->memcg = memcg;
}
-   memcg->info.nodeinfo[node] = pn;
+   memcg->nodeinfo[node] = pn;
return 0;
 }
 
 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
-   kfree(memcg->info.nodeinfo[node]);
+   kfree(memcg->nodeinfo[node]);
 }
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
-- 
1.8.2.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH RFC v2 1/3] drivers/platform/x86: add cpu physically hotplug driver

2013-06-05 Thread li guang

在 2013-06-06四的 10:24 +0800，Gu Zheng写道：
> On 06/06/2013 09:40 AM, liguang wrote:
> 
> > > this driver will support cpu physical add/removal automatically
> > after online/offline. if cpu hotpluged, cpu will not
> > online automatically, and for cpu offline, we try to
> > do actually eject if allowed for cpu like
> > "echo 1 > /sys/bus/acpi/devices/LNXCPU\:0X/eject"
> > this "echo ..." is only present for recent kernel
> > (sorry, can't figure out since when), for a little
> > older kernel, there's not such approach AFAICS.
> > 
> > Signed-off-by: liguang 
> > ---
> >  drivers/platform/x86/Kconfig  |8 
> >  drivers/platform/x86/Makefile |1 +
> >  drivers/platform/x86/cpu_physic_hotplug.c |   60 
> > +
> >  3 files changed, 69 insertions(+), 0 deletions(-)
> >  create mode 100644 drivers/platform/x86/cpu_physic_hotplug.c
> > 
> > diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
> > index 8577261..39b2392 100644
> > --- a/drivers/platform/x86/Kconfig
> > +++ b/drivers/platform/x86/Kconfig
> > @@ -789,4 +789,12 @@ config PVPANIC
> >   a paravirtualized device provided by QEMU; it lets a virtual machine
> >   (guest) communicate panic events to the host.
> >  
> > +config QEMU_CPU_PHYSIC_HOTPLUG
> > +   tristate "physically add/remove cpu after cpu onlined/offlined"
> > +   depends on ACPI_HOTPLUG_CPU
> > +   ---help---
> > + This driver will support physically remove a cpu after
> > + it offlined for QEMU automatically. someone may require this feature
> > + to do a physically removal for a cpu.
> > +
> >  endif # X86_PLATFORM_DEVICES
> > diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
> > index ef0ec74..2e669b0 100644
> > --- a/drivers/platform/x86/Makefile
> > +++ b/drivers/platform/x86/Makefile
> > @@ -53,3 +53,4 @@ obj-$(CONFIG_APPLE_GMUX)  += apple-gmux.o
> >  obj-$(CONFIG_CHROMEOS_LAPTOP)  += chromeos_laptop.o
> >  
> >  obj-$(CONFIG_PVPANIC)   += pvpanic.o
> > +obj-$(CONFIG_QEMU_CPU_PHYSIC_HOTPLUG)  += cpu_physic_hotplug.o
> > diff --git a/drivers/platform/x86/cpu_physic_hotplug.c 
> > b/drivers/platform/x86/cpu_physic_hotplug.c
> > new file mode 100644
> > index 000..a52c042
> > --- /dev/null
> > +++ b/drivers/platform/x86/cpu_physic_hotplug.c
> > @@ -0,0 +1,60 @@
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +MODULE_AUTHOR("Li Guang");
> > +MODULE_DESCRIPTION("CPU physically hot-plug/unplug Driver");
> > +MODULE_LICENSE("GPL");
> > +
> > +static int cpu_logic_hotplug_notify(struct notifier_block *nfb,
> > +   unsigned long action, void *hcpu)
> > +{
> > +   unsigned int cpu = (unsigned long)hcpu;
> > +   struct acpi_processor *pr = per_cpu(processors, cpu);
> > +
> > +   if (pr) {
> > +   switch (action) {
> > +   case CPU_ONLINE:
> > +   break;
> > +   case CPU_DEAD:
> > +   break;
> > +   default:
> > +   break;
> > +   }
> > +   }
> > +   return NOTIFY_OK;
> > +}
> > +
> > +static struct notifier_block cpu_logic_hotplug_notifier =
> > +{
> > +   .notifier_call = cpu_logic_hotplug_notify,
> > +};
> > +
> > +static int cpu_physic_hotplug_notify(struct notifier_block *nfb,
> > +unsigned char *s)
> > +{
> > +}
> 
> Hi guang,
>   Maybe you need to define the callback function in the right format at 
> the beginning,
> if so, no need to correct it later.:)
> 

right,

Thanks!

> 
> > +
> > +static struct notifier_block cpu_physic_hotplug_notifier =
> > +{
> > +   .notifier_call = cpu_physic_hotplug_notify,
> > +};
> > +
> > +static int __init cpu_qemu_hotplug_init(void)
> > +{
> > > +   register_hotcpu_notifier(&cpu_logic_hotplug_notifier);
> > > +   register_ec_gpe_notifier(&cpu_physic_hotplug_notifier);
> 
> 
> As the [PATCH 2/3] has no dependence on this one, so you can set [PATCH 2/3] 
> to [PATCH 1/3] and this one
> to [PATCH 2/3]. Then you can use the xxx_ec_space_notifier directly here.
> 
> > +   return 0;
> > +}
> > +
> > +static void __exit cpu_qemu_hotplug_exit(void)
> > +{
> > > +   unregister_hotcpu_notifier(&cpu_logic_hotplug_notifier);
> > > +   unregister_ec_gpe_notifier(&cpu_physic_hotplug_notifier);
> > +}
> > +
> > +module_init(cpu_qemu_hotplug_init);
> > +module_exit(cpu_qemu_hotplug_exit);
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

2013-06-05 Thread Anton Blanchard

Hi,

> This is causing a regression on 64bit powerpc with 32bit usermode.
> When I hit userspace, udev is broken and I suspect all networking is
> broken as well.
> 
> Can we please revert 1be374a0518a288147c6a7398792583200a67261
> upstream?
> 
> Found via bisect.

Doesn't this patch break compat_sys_sendmsg and compat_sys_recvmsg?
We'd need to move the guts of sys_* into compat_sys_* to fix it.

Anton
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] [RFC]Watchdog:core: constant pinging until userspace timesout when delay very less

2013-06-05 Thread anish singh

Hello Wim Van,
Can you look into below?

On Wed, Jun 5, 2013 at 8:39 AM, anish singh  wrote:
> Hello Wim Van Sabroeck,
> Can I get your inputs on this?
>
> On Tue, Jun 4, 2013 at 8:39 AM, anish singh  
> wrote:
>> On Tue, Jun 4, 2013 at 3:55 AM, Guenter Roeck  wrote:
>>> On Mon, Jun 03, 2013 at 10:23:04PM +0530, anish singh wrote:
 On Mon, Jun 3, 2013 at 8:57 PM, Guenter Roeck  wrote:
 > On Sun, Jun 02, 2013 at 03:43:07PM +0530, anish kumar wrote:
 >> Certain watchdog drivers use a timer to keep kicking the watchdog at
 >> a rate of 0.5s (HZ/2) until userspace times out. They do this as
 >> we can't guarantee that watchdog will be pinged fast enough
 >> for all system loads, especially if timeout is configured for
 >> less than or equal to 1 second(basically small values).
 >>
 >> As suggested by Wim Van Sebroeck & Guenter Roeck we should
 >> add this functionality of individual watchdog drivers in the core
 >> watchdog core.
 >>
 >> Signed-off-by: anish kumar 
 >
 > Not exactly what I had in mind. My idea was to enable the softdog only if
 > the hardware watchdog's maximum timeout was low (say, less than a couple
 > of minutes), and if a timeout larger than its maximum value was 
 > configured.

 watchdog_timeout_invalid wouldn't this check will fail if the user space 
 tries
 to set maximum timeout more that what driver can support?It would work
 for pika_wdt.c as it is old watchdog driver and doesn't register with 
 watchdog
 framework but new drivers has to pass this api.

 OR

 Do you want to remove this check and go as explained by you?I would
 favour this approach though.

>>> One would still have a check, but the enforced limits would no longer be
>>> the driver limits, but larger limits implemented in the watchdog core.
>> How much larger would be the big question here?Should it be configurable
>> property(sysfs?) or some hardcoding based on existing drivers?
>>
>> Before going for next patch, it would be better for me to wait for some
>> more comments.
>>>
 > In that case, I would have set the hardware watchdog to its maximum value
 > and use the softdog to ping it at a rate of, say, 50% of this maximum.
 >
 > If userspace would not ping the watchdog within its configured value,
 > I would stop pinging the hardware watchdog and let it time out.

 One more question.Why is the return value of watchdog_ping int? Anyway
 we discard it.
>>>
>>> I can not answer that question.
>>>
>>> Guenter
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

2013-06-05 Thread Michael Neuling

On Thu, May 23, 2013 at 7:07 AM, Andy Lutomirski  wrote:
> MSG_CMSG_COMPAT is (AFAIK) not intended to be part of the API --
> it's a hack that steals a bit to indicate to other networking code
> that a compat entry was used.  So don't allow it from a non-compat
> syscall.

Dave & Linus

This is causing a regression on 64bit powerpc with 32bit usermode.
When I hit userspace, udev is broken and I suspect all networking is
broken as well.

Can we please revert 1be374a0518a288147c6a7398792583200a67261 upstream?

Found via bisect.

Mikey

>
> This prevents an oops when running this code:
>
> int main()
> {
> int s;
> struct sockaddr_in addr;
> struct msghdr *hdr;
>
> char *highpage = mmap((void*)(TASK_SIZE_MAX - 4096), 4096,
>   PROT_READ | PROT_WRITE,
>   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
> if (highpage == MAP_FAILED)
> err(1, "mmap");
>
> s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
> if (s == -1)
> err(1, "socket");
>
> addr.sin_family = AF_INET;
> addr.sin_port = htons(1);
> addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

> if (connect(s, (struct sockaddr*)&addr, sizeof(addr)) != 0)
> err(1, "connect");
>
> void *evil = highpage + 4096 - COMPAT_MSGHDR_SIZE;
> printf("Evil address is %p\n", evil);
>
> if (syscall(__NR_sendmmsg, s, evil, 1, MSG_CMSG_COMPAT) < 0)
> err(1, "sendmmsg");
>
> return 0;
> }
>
> Cc: David S. Miller 
> Signed-off-by: Andy Lutomirski 
> ---
>  net/socket.c | 33 +++--
>  1 file changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/net/socket.c b/net/socket.c
> index 88f759a..0e16888 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -2097,8 +2097,12 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user 
> *, msg, unsigned int, fla
>  {
> int fput_needed, err;
> struct msghdr msg_sys;
> -   struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
> +   struct socket *sock;
> +
> +   if (flags & MSG_CMSG_COMPAT)
> +   return -EINVAL;
>
> +   sock = sockfd_lookup_light(fd, &err, &fput_needed);
> if (!sock)
> goto out;
>
> @@ -2171,6 +2175,8 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, 
> unsigned int vlen,
>  SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
> unsigned int, vlen, unsigned int, flags)
>  {
> +   if (flags & MSG_CMSG_COMPAT)
> +   return -EINVAL;
> return __sys_sendmmsg(fd, mmsg, vlen, flags);
>  }
>
> @@ -2271,8 +2277,12 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user 
> *, msg,
>  {
> int fput_needed, err;
> struct msghdr msg_sys;
> -   struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
> +   struct socket *sock;
> +
> +   if (flags & MSG_CMSG_COMPAT)
> +   return -EINVAL;
>
> +   sock = sockfd_lookup_light(fd, &err, &fput_needed);
> if (!sock)
> goto out;
>
> @@ -2397,6 +2407,9 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr 
> __user *, mmsg,
> int datagrams;
> struct timespec timeout_sys;
>
> +   if (flags & MSG_CMSG_COMPAT)
> +   return -EINVAL;
> +
> if (!timeout)
> return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
>
> @@ -2512,15 +2525,31 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long 
> __user *, args)
>(int __user *)a[4]);
> break;
> case SYS_SENDMSG:
> +   if (a[2] & MSG_CMSG_COMPAT) {
> +   err = -EINVAL;
> +   break;
> +   }
> err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
> break;
> case SYS_SENDMMSG:
> +   if (a[3] & MSG_CMSG_COMPAT) {
> +   err = -EINVAL;
> +   break;
> +   }
> err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], 
> a[3]);
> break;
> case SYS_RECVMSG:
> +   if (a[2] & MSG_CMSG_COMPAT) {
> +   err = -EINVAL;
> +   break;
> +   }
> err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
> break;
> case SYS_RECVMMSG:
> +   if (a[3] & MSG_CMSG_COMPAT) {
> +   err = -EINVAL;
> +   break;
> +   }
> err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], 
> a[3],
>(struct timespec __user *)a[4]);
> break;
> --
> 1.8.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More

RE: [PATCH v2] ARM: mmp: bring up pxa988 with device tree support

2013-06-05 Thread Neil Zhang

Hi Arnd,

> -Original Message-
> From: Arnd Bergmann [mailto:a...@arndb.de]
> Sent: 2013年5月31日 19:25
> To: linux-arm-ker...@lists.infradead.org
> Cc: Neil Zhang; haojian.zhu...@gmail.com; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2] ARM: mmp: bring up pxa988 with device tree support
> 
> On Friday 31 May 2013 10:58:35 Neil Zhang wrote:
> > bring up pxa988 with device tree support.
> >
> > Change-Id: I6fc869b7d5ff8dc6e4eb0042a89429200f7a9fb1
> 
> Please don't post silly extra headers like that.

Sorry for the noise, will remove it.

> 
> > Signed-off-by: Neil Zhang 
> 
> A couple of comments on the DT structure:
> 
> > +   gic: interrupt-controller@d1dfe100 {
> > +   compatible = "arm,cortex-a9-gic";
> > +   interrupt-controller;
> > +   #interrupt-cells = <3>;
> > +   reg = <0xd1dff000 0x1000>,
> > + <0xd1dfe100 0x0100>;
> > +   };
> > +
> > +   L2: l2-cache-controller@d1dfb000 {
> > +   compatible = "arm,pl310-cache";
> > +   reg = <0xd1dfb000 0x1000>;
> > +   arm,data-latency = <2 1 1>;
> > +   arm,tag-latency = <2 1 1>;
> > +   arm,pwr-dynamic-clk-gating;
> > +   arm,pwr-standby-mode;
> > +   cache-unified;
> > +   cache-level = <2>;
> > +   };
> > +
> > +   local-timer@d1dfe600 {
> > +   compatible = "arm,cortex-a9-twd-timer";
> > +   reg = <0xd1dfe600 0x20>;
> > +   interrupts = <1 13 0x304>;
> > +   };
> 
> Why are these all top-level devices, rather than part of the 'soc' node?

Yes, we can move it as child of soc.

> 
> > +   soc {
> > +   #address-cells = <1>;
> > +   #size-cells = <1>;
> > +   compatible = "simple-bus";
> > +   interrupt-parent = <>;
> > +   ranges;
> > +
> > +   axi@d420 {  /* AXI */
> > +   compatible = "mrvl,axi-bus", "simple-bus";
> > +   #address-cells = <1>;
> > +   #size-cells = <1>;
> > +   reg = <0xd420 0x0020>;
> > +   ranges;
> > +
> > +   intc: wakeupgen@d4282000 {
> > +   compatible = "mrvl,mmp-intc";
> > +   reg = <0xd4282000 0x1000>;
> > +   mrvl,intc-wakeup = <0x114 0x3
> > +   0x144 0x3>;
> > +   };
> > +
> > +   };
> 
> What is a 'mrvl,axi-bus'? Is that different from ARM's AXI bus?
> 
> The documented vendor prefix for Marvell is 'marvell', not 'mrvl', please be
> consistent with that.
> 
> What is the purpose of the 'reg' property in the axi bus? I notice that it
> overlaps with its own children, which seems very strange.
> Maybe you meant this:
> 
>   axi {
>   ranges = <0xd420 0xd420 0x0020>;
>   ...
>   };
> 
> > +   apb@d400 {  /* APB */
> > +   compatible = "mrvl,apb-bus", "simple-bus";
> > +   #address-cells = <1>;
> > +   #size-cells = <1>;
> > +   reg = <0xd400 0x0020>;
> > +   ranges;
> 
> Same comments apply here.
> 
Thanks for the comments here, will modify it later.

> > diff --git a/arch/arm/mach-mmp/Kconfig b/arch/arm/mach-mmp/Kconfig
> > index ebdda83..0955191 100644
> > --- a/arch/arm/mach-mmp/Kconfig
> > +++ b/arch/arm/mach-mmp/Kconfig
> > @@ -107,6 +107,16 @@ config MACH_MMP2_DT
> >   Include support for Marvell MMP2 based platforms using
> >   the device tree.
> >
> > +config MACH_MMPX_DT
> > +   bool "Support no-PJ/PJ4(ARMv7) platforms from device tree"
> > +   depends on !CPU_MOHAWK && !CPU_PJ4
> > +   select CPU_PXA988
> > +   select USE_OF
> > +   select PINCTRL
> > +   select PINCTRL_SINGLE
> 
> Why would this be mutually exclusive with PJ4? Cortex-A9 and PJ4 are both
> ARMv7 based, so we should be able to have them in the same kernel.

The MACH_MMPX_DT here is for standard ARM core based SoC.
But PJ4 is modified by Marvell, which includes IWMMXT.
 
> 
> > +   help
> > + Include support for Marvell MMP2 based platforms using
> > + the device tree.
> >  endmenu
> 
> You should probably change the help texts to say different things here, e.g.
> list the models supported under these.

Thanks for the reminder, will modify it later.

> 
> > diff --git a/arch/arm/mach-mmp/common.c
> b/arch/arm/mach-mmp/common.c
> > index 9292b79..0c621bc 100644
> > --- a/arch/arm/mach-mmp/common.c
> > +++ b/arch/arm/mach-mmp/common.c
> > @@ -11,6 +11,10 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> >
> >  #include 
> >  #include 
> > @@ -36,7 +40,12 @@ static struct map_desc standard_io_desc[]
> __initdata = {
> > .virtual= (unsigned long)AXI_VIRT_BASE,
> > .length = AXI_PHYS_SIZE,
> > .type   = MT_DEVICE,
> > -   },
> > +   }, {
> > +

[PATCH 1/2 v3] SELinux: Reduce overhead of mls_level_isvalid() function call

2013-06-05 Thread Waiman Long

v2->v3:
  - Remove unused local variables i, node from mls_level_isvalid().

v1->v2:
 - Move the new ebitmap comparison logic from mls_level_isvalid()
   into the ebitmap_contains() helper function.
 - Rerun perf and performance tests on the latest v3.10-rc4 kernel.

While running the high_systime workload of the AIM7 benchmark on
a 2-socket 12-core Westmere x86-64 machine running 3.10-rc4 kernel
(with HT on), it was found that a pretty sizable amount of time was
spent in the SELinux code. Below was the perf trace of the "perf
record -a -s" of a test run at 1500 users:

  5.04%ls  [kernel.kallsyms] [k] ebitmap_get_bit
  1.96%ls  [kernel.kallsyms] [k] mls_level_isvalid
  1.95%ls  [kernel.kallsyms] [k] find_next_bit

The ebitmap_get_bit() was the hottest function in the perf-report
output.  Both the ebitmap_get_bit() and find_next_bit() functions
were, in fact, called by mls_level_isvalid(). As a result, the
mls_level_isvalid() call consumed 8.95% of the total CPU time of
all the 24 virtual CPUs which is quite a lot. The majority of the
mls_level_isvalid() function invocations come from the socket creation
system call.

Looking at the mls_level_isvalid() function, it is checking to see
if all the bits set in one of the ebitmap structure are also set in
another one as well as the highest set bit is no bigger than the one
specified by the given policydb data structure. It is doing it in
a bit-by-bit manner. So if the ebitmap structure has many bits set,
the iteration loop will be done many times.

The current code can be rewritten to use a similar algorithm as the
ebitmap_contains() function with an additional check for the
highest set bit. The ebitmap_contains() function was extended to
cover an optional additional check for the highest set bit, and the
mls_level_isvalid() function was modified to call ebitmap_contains().

With that change, the perf trace showed that the used CPU time dropped
down to just 0.08% (ebitmap_contains + mls_level_isvalid) of the
total which is about 100X less than before.

  0.07%ls  [kernel.kallsyms] [k] ebitmap_contains
  0.05%ls  [kernel.kallsyms] [k] ebitmap_get_bit
  0.01%ls  [kernel.kallsyms] [k] mls_level_isvalid
  0.01%ls  [kernel.kallsyms] [k] find_next_bit

The remaining ebitmap_get_bit() and find_next_bit() functions calls
are made by other kernel routines as the new mls_level_isvalid()
function will not call them anymore.

This patch also improves the high_systime AIM7 benchmark result,
though the improvement is not as impressive as is suggested by the
reduction in CPU time spent in the ebitmap functions. The table below
shows the performance change on the 2-socket x86-64 system (with HT
on) mentioned above.

+--+---++-+
|   Workload   | mean % change | mean % change  | mean % change   |
|  | 10-100 users  | 200-1000 users | 1100-2000 users |
+--+---++-+
| high_systime | +0.1% | +0.9%  | +2.6%   |
+--+---++-+

Signed-off-by: Waiman Long 
---
 security/selinux/ss/ebitmap.c   |   35 ++-
 security/selinux/ss/ebitmap.h   |2 +-
 security/selinux/ss/mls.c   |   22 +++---
 security/selinux/ss/mls_types.h |2 +-
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c
index 30f119b..100b3e6 100644
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -213,7 +213,12 @@ netlbl_import_failure:
 }
 #endif /* CONFIG_NETLABEL */
 
-int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2)
+/*
+ * Check to see if all the bits set in e2 are also set in e1. Optionally,
+ * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed
+ * last_e2bit.
+ */
+int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit)
 {
struct ebitmap_node *n1, *n2;
int i;
@@ -223,6 +228,33 @@ int ebitmap_contains(struct ebitmap *e1, struct ebitmap 
*e2)
 
n1 = e1->node;
n2 = e2->node;
+   if (last_e2bit) {
+   while (n1 && n2 && (n1->startbit <= n2->startbit)) {
+   int lastsetbit = -1;
+
+   if (n1->startbit < n2->startbit) {
+   n1 = n1->next;
+   continue;
+   }
+   for (i = EBITMAP_UNIT_NUMS - 1; i >= 0; i--) {
+   if (!n2->maps[i])
+   continue;
+   if ((n1->maps[i] & n2->maps[i]) != n2->maps[i])
+   return 0;
+   if (lastsetbit < 0)
+   lastsetbit =

Re: [PATCH v2][RFC] tracing/context-tracking: Add preempt_schedule_context() for tracing

2013-06-05 Thread Steven Rostedt

On Wed, 2013-06-05 at 09:41 -0400, Steven Rostedt wrote:
> > 
> > If preempt_enable_notrace() is the only user, why does this live in
> > kernel/context_tracking.c and not in kernel/sched/core.c?
> 
> Then we would need to add #ifdef CONFIG_CONTEXT_TRACKING around it too.
> As we have in preempt.h:
> 
> #ifdef CONFIG_CONTEXT_TRACKING
> 
> void preempt_schedule_context(void);
> 
> #define preempt_check_resched_context() \
> do { \
>   if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
>   preempt_schedule_context(); \
> } while(0)
> #else
> 
> #define preempt_check_resched_context() preempt_check_resched()
> 
> #endif
> 
> Seemed like it was more appropriate to be in context_tracking.c than
> sched/core.c, as it depends on context tracking.
> 

Peter,

Are you fine with this answer, or do you still believe I should put it
into sched/core.c?


> > > +EXPORT_SYMBOL(preempt_schedule_context);
> > 
> > Do we really need to export this?
> 
> As preempt_enable_notrace() is also used by tracepoints, yes. Because
> tracepoints are used by modules. Hmm, now it may be exported via a GPL
> export though.
> 

I switched it to EXPORT_SYMBOL_GPL, so that only GPL modules may use
preempt_enable_notrace().

Are you OK with this?

Here's the new patch:

>From fa5be043fb62472b0ca599d24b22d0f52b83 Mon Sep 17 00:00:00 2001
From: Steven Rostedt 
Date: Fri, 24 May 2013 15:23:40 -0400
Subject: [PATCH] tracing/context-tracking: Add preempt_schedule_context() for
 tracing

Dave Jones hit the following bug report:

 ===
 [ INFO: suspicious RCU usage. ]
 3.10.0-rc2+ #1 Not tainted
 ---
 include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle!
 other info that might help us debug this:
 RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0
 RCU used illegally from extended quiescent state!
 2 locks held by cc1/63645:
  #0:  (>lock){-.-.-.}, at: [] __schedule+0xed/0x9b0
  #1:  (rcu_read_lock){.+.+..}, at: [] 
cpuacct_charge+0x5/0x1f0

 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 
13.39 25/277 64369]
 Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS 
F12a 04/23/2010
   88010f78fcf8 816ae383 88010f78fd28
  810b698d 88011c092548 0023d073 88011c092500
  0001 88010f78fd60 8109d7c5 8109d645
 Call Trace:
  [] dump_stack+0x19/0x1b
  [] lockdep_rcu_suspicious+0xfd/0x130
  [] cpuacct_charge+0x185/0x1f0
  [] ? cpuacct_charge+0x5/0x1f0
  [] update_curr+0xec/0x240
  [] put_prev_task_fair+0x228/0x480
  [] __schedule+0x161/0x9b0
  [] preempt_schedule+0x51/0x80
  [] ? __cond_resched_softirq+0x60/0x60
  [] ? retint_careful+0x12/0x2e
  [] ftrace_ops_control_func+0x1dc/0x210
  [] ftrace_call+0x5/0x2f
  [] ? retint_careful+0xb/0x2e
  [] ? schedule_user+0x5/0x70
  [] ? schedule_user+0x5/0x70
  [] ? retint_careful+0x12/0x2e
 [ cut here ]

What happened was that the function tracer traced the schedule_user() code
that tells RCU that the system is coming back from userspace, and to
add the CPU back to the RCU monitoring.

Because the function tracer does a preempt_disable/enable_notrace() calls
the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set,
then preempt_schedule() is called. But this is called before the user_exit()
function can inform the kernel that the CPU is no longer in user mode and
needs to be accounted for by RCU.

The fix is to create a new preempt_schedule_context() that checks if
the kernel is still in user mode and if so to switch it to kernel mode
before calling schedule. It also switches back to user mode coming back
from schedule if need be.

The only user of this currently is the preempt_enable_notrace(), which is
only used by the tracing subsystem.

Link: http://lkml.kernel.org/r/1369423420.6828.226.ca...@gandalf.local.home

Signed-off-by: Steven Rostedt 
---
 include/linux/preempt.h   |   18 +-
 kernel/context_tracking.c |   40 
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 87a03c7..f5d4723 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -33,9 +33,25 @@ do { \
preempt_schedule(); \
 } while (0)
 
+#ifdef CONFIG_CONTEXT_TRACKING
+
+void preempt_schedule_context(void);
+
+#define preempt_check_resched_context() \
+do { \
+   if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+   preempt_schedule_context(); \
+} while (0)
+#else
+
+#define preempt_check_resched_context() preempt_check_resched()
+
+#endif /* CONFIG_CONTEXT_TRACKING */
+
 #else /* !CONFIG_PREEMPT */
 
 #define preempt_check_resched()do { } while (0)
+#define preempt_check_resched_context()do { } while (0)
 
 #endif /* CONFIG_PREEMPT */
 
@@ -88,7 +104,7 @@ do { \

[PATCH] doc: avoid strncpy in accounting tool

2013-06-05 Thread Kees Cook

Avoid strncpy anti-pattern. Use strdup() instead, as already done for
the logfile optarg.

Signed-off-by: Kees Cook 
---
Fix for -mm clean-up-scary-strncpydst-src-strlensrc-uses-fix.patch
---
 Documentation/accounting/getdelays.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/accounting/getdelays.c 
b/Documentation/accounting/getdelays.c
index f8ebcde..1db89d3 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -272,7 +272,7 @@ int main(int argc, char *argv[])
char *logfile = NULL;
int loop = 0;
int containerset = 0;
-   char containerpath[1024];
+   char *containerpath = NULL;
int cfd = 0;
int forking = 0;
sigset_t sigset;
@@ -299,7 +299,7 @@ int main(int argc, char *argv[])
break;
case 'C':
containerset = 1;
-   strncpy(containerpath, optarg, strlen(optarg) + 1);
+   containerpath = strdup(optarg);
break;
case 'w':
logfile = strdup(optarg);
-- 
1.7.9.5


-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH cgroup/for-3.11] cgroup: disallow cpuacct if sane_behavior

2013-06-05 Thread Tejun Heo

cpuacct causes duplicate accountings on the scheduler and cpu will
provide equivalent stats.  Optimizations for cases where cpu and
cpuacct are co-mounted are being worked on but we want to deprecate it
eventually.  Let's disallow cpuacct if __DEVEL__sane_behavior.

Signed-off-by: Tejun Heo 
Cc: Glauber Costa 
Cc: Peter Zijlstra 
---
 include/linux/cgroup.h |3 +++
 kernel/cgroup.c|5 +
 2 files changed, 8 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d0ad379..6c3bbdb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -280,6 +280,9 @@ enum {
 * - memcg: use_hierarchy is on by default and the cgroup file for
 *   the flag is not created.
 *
+* - cpuacct: No longer supported.  cpu will have matching stats.
+*   Use those instead.
+*
 * The followings are planned changes.
 *
 * - release_agent will be disallowed once replacement notification
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc53d50..5c746eb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1256,6 +1256,11 @@ static int parse_cgroupfs_options(char *data, struct 
cgroup_sb_opts *opts)
pr_err("cgroup: sane_behavior: clone_children is not 
allowed\n");
return -EINVAL;
}
+
+   if (test_bit(cpuacct_subsys_id, >subsys_mask)) {
+   pr_err("cgroup: sane_behavior: cpuacct is no longer 
available, use stats from cpu instead\n");
+   return -EINVAL;
+   }
}
 
/*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 3/3] sched: remove the useless declaration in kernel/sched/fair.c

2013-06-05 Thread Michael Wang

v2:
re-based on latest tip/master

default_cfs_period(), do_sched_cfs_period_timer(), do_sched_cfs_slack_timer()
are already defined previously, so there is no need to declare them again.

CC: Ingo Molnar 
CC: Peter Zijlstra 
Signed-off-by: Michael Wang 
---
 kernel/sched/fair.c |4 
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0cea941..9efc50f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2618,10 +2618,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
 }
 
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
struct cfs_bandwidth *cfs_b =
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] corner cases of open() on procfs symlinks

2013-06-05 Thread Linus Torvalds

On Thu, Jun 6, 2013 at 11:29 AM, Al Viro  wrote:
>
> Probably...  procfs symlinks neutering O_DIRECTORY might, in theory, be usable
> to cook something nasty, but I don't see any obvious ways to exploit that.
> FWIW, resulting kernel seems to survive the minimal beating, but obviously
> more is needed.

Let's plan on merging that patch in the 3.11 merge window, and perhaps
mark it for stable?

 Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/3] sched: code refine in unthrottle_cfs_rq()

2013-06-05 Thread Michael Wang

v2:
re-based on latest tip/master

Directly use rq to save some code.

CC: Ingo Molnar 
CC: Peter Zijlstra 
Signed-off-by: Michael Wang 
---
 kernel/sched/fair.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 143dcdb..0cea941 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2275,7 +2275,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
long task_delta, dequeue = 1;
 
-   se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+   se = cfs_rq->tg->se[cpu_of(rq)];
 
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 1/2] dma: mmp_pdma: add protect when alloc/free phy channels

2013-06-05 Thread Xiang Wang

From: Xiang Wang 

In mmp pdma, phy channels are allocated/freed dynamically
and frequently, but no proper protection is in place.
Conflicts will happen when multiple users request phy
channels at the same time. Use a spinlock to protect them.

Signed-off-by: Xiang Wang 
---
 drivers/dma/mmp_pdma.c |   42 ++
 1 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c
index c26699f..84e51a1 100644
--- a/drivers/dma/mmp_pdma.c
+++ b/drivers/dma/mmp_pdma.c
@@ -121,6 +121,7 @@ struct mmp_pdma_device {
struct device   *dev;
struct dma_device   device;
struct mmp_pdma_phy *phy;
+   spinlock_t phy_lock; /* protect alloc/free phy channels */
 };
 
 #define tx_to_mmp_pdma_desc(tx) container_of(tx, struct mmp_pdma_desc_sw, 
async_tx)
@@ -219,6 +220,7 @@ static struct mmp_pdma_phy *lookup_phy(struct mmp_pdma_chan 
*pchan)
int prio, i;
struct mmp_pdma_device *pdev = to_mmp_pdma_dev(pchan->chan.device);
struct mmp_pdma_phy *phy;
+   unsigned long flags;
 
/*
 * dma channel priorities
@@ -227,6 +229,8 @@ static struct mmp_pdma_phy *lookup_phy(struct mmp_pdma_chan 
*pchan)
 * ch 8 - 11, 24 - 27  <--> (2)
 * ch 12 - 15, 28 - 31  <--> (3)
 */
+
+   spin_lock_irqsave(>phy_lock, flags);
for (prio = 0; prio <= (((pdev->dma_channels - 1) & 0xf) >> 2); prio++) 
{
for (i = 0; i < pdev->dma_channels; i++) {
if (prio != ((i & 0xf) >> 2))
@@ -234,14 +238,30 @@ static struct mmp_pdma_phy *lookup_phy(struct 
mmp_pdma_chan *pchan)
phy = >phy[i];
if (!phy->vchan) {
phy->vchan = pchan;
+   spin_unlock_irqrestore(>phy_lock, flags);
return phy;
}
}
}
 
+   spin_unlock_irqrestore(>phy_lock, flags);
return NULL;
 }
 
+static void free_phy(struct mmp_pdma_chan *pchan)
+{
+   struct mmp_pdma_device *pdev = to_mmp_pdma_dev(pchan->chan.device);
+   unsigned long flags;
+
+   if (!pchan->phy)
+   return;
+
+   spin_lock_irqsave(>phy_lock, flags);
+   pchan->phy->vchan = NULL;
+   pchan->phy = NULL;
+   spin_unlock_irqrestore(>phy_lock, flags);
+}
+
 /* desc->tx_list ==> pending list */
 static void append_pending_queue(struct mmp_pdma_chan *chan,
struct mmp_pdma_desc_sw *desc)
@@ -277,10 +297,7 @@ static void start_pending_queue(struct mmp_pdma_chan *chan)
 
if (list_empty(>chain_pending)) {
/* chance to re-fetch phy channel with higher prio */
-   if (chan->phy) {
-   chan->phy->vchan = NULL;
-   chan->phy = NULL;
-   }
+   free_phy(chan);
dev_dbg(chan->dev, "no pending list\n");
return;
}
@@ -377,10 +394,7 @@ static int mmp_pdma_alloc_chan_resources(struct dma_chan 
*dchan)
dev_err(chan->dev, "unable to allocate descriptor pool\n");
return -ENOMEM;
}
-   if (chan->phy) {
-   chan->phy->vchan = NULL;
-   chan->phy = NULL;
-   }
+   free_phy(chan);
chan->idle = true;
chan->dev_addr = 0;
return 1;
@@ -411,10 +425,7 @@ static void mmp_pdma_free_chan_resources(struct dma_chan 
*dchan)
chan->desc_pool = NULL;
chan->idle = true;
chan->dev_addr = 0;
-   if (chan->phy) {
-   chan->phy->vchan = NULL;
-   chan->phy = NULL;
-   }
+   free_phy(chan);
return;
 }
 
@@ -581,10 +592,7 @@ static int mmp_pdma_control(struct dma_chan *dchan, enum 
dma_ctrl_cmd cmd,
switch (cmd) {
case DMA_TERMINATE_ALL:
disable_chan(chan->phy);
-   if (chan->phy) {
-   chan->phy->vchan = NULL;
-   chan->phy = NULL;
-   }
+   free_phy(chan);
spin_lock_irqsave(>desc_lock, flags);
mmp_pdma_free_desc_list(chan, >chain_pending);
mmp_pdma_free_desc_list(chan, >chain_running);
@@ -777,6 +785,8 @@ static int mmp_pdma_probe(struct platform_device *op)
return -ENOMEM;
pdev->dev = >dev;
 
+   spin_lock_init(>phy_lock);
+
iores = platform_get_resource(op, IORESOURCE_MEM, 0);
if (!iores)
return -EINVAL;
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] corner cases of open() on procfs symlinks

2013-06-05 Thread Al Viro

On Thu, Jun 06, 2013 at 10:38:31AM +0900, Linus Torvalds wrote:
> On Thu, Jun 6, 2013 at 10:20 AM, Al Viro  wrote:
> > I'm not sure whether to treat that as a bug or as a weird misfeature
> > enshrined in userland ABI:
> > open("/tmp", O_CREAT, 0) => -EISDIR // LAST_NORM case
> > open("/", O_CREAT, 0) => -EISDIR// LAST_ROOT
> > open(".", O_CREAT, 0) => -EISDIR// LAST_DOT
> > open("..", O_CREAT, 0) => -EISDIR   // LAST_DOTDOT
> > open("/proc/self/cwd", O_CREAT, 0) => success   // LAST_BIND
> > open("/proc/self/cwd/", O_CREAT, 0) => -EISDIR  // trailing slashes
> 
> Ok, that looks buggy. O_CREAT should definitely return EISDIR for
> /proc/self/cwd too, since it's a directory. I don't think the
> O_RDWR/O_WRONLY thing should matter.
> 
> >I would obviously
> > like to do that - do_last() is far too convoluted as it is; the only
> > question is whether we can change the first weirdness...  Comments?
> 
> Exactly which cases does that change? I have no objections if it's
> only the "LAST_BIND" case that now starts returning EISDIR. Is there
> anything else it affects?

LAST_BIND gets to go through the EISDIR and ENOTDIR checks that way, which
fixes these two bugs.

LAST_DOT/LAST_DOTDOT/LAST_ROOT end up checking whether we are at the
directory or not; sure, we know that we are, so these tests are
redundant, but I really don't think it's worth optimizing for.  We are
not generating any data misses and arguably we reduce instruction cache
footprint a bit, not that it would be noticeable with the I$ horror
do_last() still is...

What really happens in that switch is that do_last() tries to be too smart
and ends up skipping a few things too many.

> That said, obviously if something breaks, we'd have to revert it, and
> as a cleanup rather than some serious bug (ie this doesn't cause
> crashes or security issues), I suspect this should wait until 3.11
> regardless. No?

Probably...  procfs symlinks neutering O_DIRECTORY might, in theory, be usable
to cook something nasty, but I don't see any obvious ways to exploit that.
FWIW, resulting kernel seems to survive the minimal beating, but obviously
more is needed.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH RFC v2 1/3] drivers/platform/x86: add cpu physically hotplug driver

2013-06-05 Thread Gu Zheng

On 06/06/2013 09:40 AM, liguang wrote:

> this driver will support cpu physical add/removal automatically
> after online/offline. if cpu hotplugged, cpu will not
> online automatically, and for cpu offline, we try to
> do actually eject if allowed for cpu like
> "echo 1 > /sys/bus/acpi/devices/LNXCPU\:0X/eject"
> this "echo ..." is only present for recent kernel
> (sorry, can't figure out since when), for a little
> older kernel, there's not such approach AFAICS.
> 
> Signed-off-by: liguang 
> ---
>  drivers/platform/x86/Kconfig  |8 
>  drivers/platform/x86/Makefile |1 +
>  drivers/platform/x86/cpu_physic_hotplug.c |   60 
> +
>  3 files changed, 69 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/platform/x86/cpu_physic_hotplug.c
> 
> diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
> index 8577261..39b2392 100644
> --- a/drivers/platform/x86/Kconfig
> +++ b/drivers/platform/x86/Kconfig
> @@ -789,4 +789,12 @@ config PVPANIC
> a paravirtualized device provided by QEMU; it lets a virtual machine
> (guest) communicate panic events to the host.
>  
> +config QEMU_CPU_PHYSIC_HOTPLUG
> + tristate "physically add/remove cpu after cpu onlined/offlined"
> + depends on ACPI_HOTPLUG_CPU
> + ---help---
> +   This driver will support physically remove a cpu after
> +   it offlined for QEMU automatically. someone may require this feature
> +   to do a physically removal for a cpu.
> +
>  endif # X86_PLATFORM_DEVICES
> diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
> index ef0ec74..2e669b0 100644
> --- a/drivers/platform/x86/Makefile
> +++ b/drivers/platform/x86/Makefile
> @@ -53,3 +53,4 @@ obj-$(CONFIG_APPLE_GMUX)+= apple-gmux.o
>  obj-$(CONFIG_CHROMEOS_LAPTOP)+= chromeos_laptop.o
>  
>  obj-$(CONFIG_PVPANIC)   += pvpanic.o
> +obj-$(CONFIG_QEMU_CPU_PHYSIC_HOTPLUG)+= cpu_physic_hotplug.o
> diff --git a/drivers/platform/x86/cpu_physic_hotplug.c 
> b/drivers/platform/x86/cpu_physic_hotplug.c
> new file mode 100644
> index 000..a52c042
> --- /dev/null
> +++ b/drivers/platform/x86/cpu_physic_hotplug.c
> @@ -0,0 +1,60 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +MODULE_AUTHOR("Li Guang");
> +MODULE_DESCRIPTION("CPU physically hot-plug/unplug Driver");
> +MODULE_LICENSE("GPL");
> +
> +static int cpu_logic_hotplug_notify(struct notifier_block *nfb,
> + unsigned long action, void *hcpu)
> +{
> + unsigned int cpu = (unsigned long)hcpu;
> + struct acpi_processor *pr = per_cpu(processors, cpu);
> +
> + if (pr) {
> + switch (action) {
> + case CPU_ONLINE:
> + break;
> + case CPU_DEAD:
> + break;
> + default:
> + break;
> + }
> + }
> + return NOTIFY_OK;
> +}
> +
> +static struct notifier_block cpu_logic_hotplug_notifier =
> +{
> + .notifier_call = cpu_logic_hotplug_notify,
> +};
> +
> +static int cpu_physic_hotplug_notify(struct notifier_block *nfb,
> +  unsigned char *s)
> +{
> +}

Hi guang,
Maybe you need to define the callback function in the right format at 
the beginning,
if so, no need to correct it later.:)

Thanks,
Gu


> +
> +static struct notifier_block cpu_physic_hotplug_notifier =
> +{
> + .notifier_call = cpu_physic_hotplug_notify,
> +};
> +
> +static int __init cpu_qemu_hotplug_init(void)
> +{
> + register_hotcpu_notifier(_logic_hotplug_notifier);
> + register_ec_gpe_notifier(_physic_hotplug_notifier);


As the [PATCH 2/3] has no dependence on this one, so you can set [PATCH 2/3] to 
[PATCH 1/3] and this one
to [PATCH 2/3]. Then you can use the xxx_ec_space_notifier directly here.

> + return 0;
> +}
> +
> +static void __exit cpu_qemu_hotplug_exit(void)
> +{
> + unregister_hotcpu_notifier(_logic_hotplug_notifier);
> + unregister_ec_gpe_notifier(_physic_hotplug_notifier);
> +}
> +
> +module_init(cpu_qemu_hotplug_init);
> +module_exit(cpu_qemu_hotplug_exit);


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/2] dma: mmp_pdma: clear DRCMR when free a phy channel

2013-06-05 Thread Xiang Wang

From: Xiang Wang 

In mmp pdma, phy channels are allocated/freed dynamically.
The mapping from DMA request to DMA channel number in DRCMR
should be cleared when a phy channel is freed. Otherwise
conflicts will happen when:
1. A is using channel 2 and free it after finished, but A
still maps to channel 2 in DRCMR of A.
2. Now another one B gets channel 2. So B maps to channel 2
too in DRCMR of B.
In the datasheet, it is described that "Do not map two active
requests to the same channel since it produces unpredictable
results" and we can observe that during test.

Signed-off-by: Xiang Wang 
---
 drivers/dma/mmp_pdma.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c
index 84e51a1..410dc26 100644
--- a/drivers/dma/mmp_pdma.c
+++ b/drivers/dma/mmp_pdma.c
@@ -252,10 +252,16 @@ static void free_phy(struct mmp_pdma_chan *pchan)
 {
struct mmp_pdma_device *pdev = to_mmp_pdma_dev(pchan->chan.device);
unsigned long flags;
+   u32 reg;
 
if (!pchan->phy)
return;
 
+   /* clear the channel mapping in DRCMR */
+   reg = pchan->phy->vchan->drcmr;
+   reg = ((reg < 64) ? 0x0100 : 0x1100) + ((reg & 0x3f) << 2);
+   writel(0, pchan->phy->base + reg);
+
spin_lock_irqsave(>phy_lock, flags);
pchan->phy->vchan = NULL;
pchan->phy = NULL;
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Modify UEFI anti-bricking code

2013-06-05 Thread joeyli

於 三，2013-06-05 於 16:08 +，Matthew Garrett 提到：
> On Wed, 2013-06-05 at 16:59 +0100, Matt Fleming wrote:
> 
> > +   /* clean DUMMY object */
> > +   efi.set_variable(efi_dummy_name, _DUMMY_GUID, 0, 0, NULL);
> 
> Hm. Actually, is that going to work? From the spec:
> 

I tested the patch on OVMF; it can delete the DUMMY object when the system boots.

> If a preexisting variable is rewritten with different attributes,
> SetVariable()shall not modify the variable and shall return
> EFI_INVALID_PARAMETER. 
> 
> So I think we probably need to fix the attributes to NV|RT|BS for both
> this call and the one in query_variable_store. We should probably also
> only do the workaround if the NV bit is set in the original query.
> 
> -- 
> Matthew Garrett | mj...@srcf.ucam.org

Yes, I think it is safer to fix the attributes.


Thanks a lot!
Joey Lee



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 0/2] dma: mmp_pdma: Fix phy channels not protected issue

2013-06-05 Thread Xiang Wang

From: Xiang Wang 

This patch set deals with the issues that 1) phy channels are not protected
in mmp_pdma. 2) dma request<->channel mapping is not cleared when a phy chan
is freed.

Xiang Wang (2):
  dma: mmp_pdma: add protect when alloc/free phy channels
  dma: mmp_pdma: clear DRCMR when free a phy channel

 drivers/dma/mmp_pdma.c |   48 
 1 files changed, 32 insertions(+), 16 deletions(-)

-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/3] sched: code refine in unthrottle_cfs_rq()

2013-06-05 Thread Michael Wang

On 06/05/2013 07:15 PM, Peter Zijlstra wrote:
> On Tue, Jun 04, 2013 at 02:23:39PM +0800, Michael Wang wrote:
>> Directly use rq to save some code.
>>
>> CC: Ingo Molnar 
>> CC: Peter Zijlstra 
>> Signed-off-by: Michael Wang 
> 
> Please send patches against tip/master; the below didn't apply cleanly.
> It was a trivial conflict so I applied force and made it fit.

My sincere apologies for that. Please allow me to resend the accepted
patches based on the latest tip/master, and forgive me for creating extra
work like that...

Regards,
Michael Wang

> 
> Thanks!
> 
>> ---
>>  kernel/sched/fair.c |2 +-
>>  1 files changed, 1 insertions(+), 1 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index c61a614..1e10911 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2298,7 +2298,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>>  int enqueue = 1;
>>  long task_delta;
>>  
>> -se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>> +se = cfs_rq->tg->se[cpu_of(rq)];
>>  
>>  cfs_rq->throttled = 0;
>>  raw_spin_lock(_b->lock);
>> -- 
>> 1.7.4.1
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 1/3] sched: don't repeat the initialization in sched_init()

2013-06-05 Thread Michael Wang

On 06/05/2013 07:06 PM, Peter Zijlstra wrote:
> On Wed, Jun 05, 2013 at 10:24:18AM +0800, Michael Wang wrote:
>> v2:
>>  Move comments back before init_tg_cfs_entry(). (Thanks for the notify 
>> from pjt)
>>
>> In sched_init(), there is no need to initialize 'root_task_group.shares' and
>> 'root_task_group.cfs_bandwidth' repeatedly.
>>
>> CC: Paul Turner 
>> CC: Ingo Molnar 
>> CC: Peter Zijlstra 
>> Signed-off-by: Michael Wang 
>> ---
>>  kernel/sched/core.c |7 +--
>>  1 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 58453b8..96f69da 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6955,6 +6955,11 @@ void __init sched_init(void)
>>  
>>  #endif /* CONFIG_CGROUP_SCHED */
>>  
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +root_task_group.shares = ROOT_TASK_GROUP_LOAD;
>> +init_cfs_bandwidth(_task_group.cfs_bandwidth);
>> +#endif
>> +
>>  for_each_possible_cpu(i) {
>>  struct rq *rq;
>>  
>> @@ -6966,7 +6971,6 @@ void __init sched_init(void)
>>  init_cfs_rq(>cfs);
>>  init_rt_rq(>rt, rq);
>>  #ifdef CONFIG_FAIR_GROUP_SCHED
>> -root_task_group.shares = ROOT_TASK_GROUP_LOAD;
>>  INIT_LIST_HEAD(>leaf_cfs_rq_list);
>>  /*
>>   * How much cpu bandwidth does root_task_group get?
>> @@ -6987,7 +6991,6 @@ void __init sched_init(void)
>>   * We achieve this by letting root_task_group's tasks sit
>>   * directly in rq->cfs (i.e root_task_group->se[] = NULL).
>>   */
>> -init_cfs_bandwidth(_task_group.cfs_bandwidth);
>>  init_tg_cfs_entry(_task_group, >cfs, NULL, i, NULL);
>>  #endif /* CONFIG_FAIR_GROUP_SCHED */
> 
> I would actually like a patch reducing the #ifdef forest there, not
> adding to it.

I see :)

> 
> There's no actual harm in doing the initialization multiple times,
> right?

Yeah, it's safe to redo the init, cost some cycles but not so expensive.

Regards,
Michael Wang

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] mtd: cfi_cmdset_0002: increase do_write_buffer() timeout

2013-06-05 Thread Huang Shijie

于 2013年06月06日 05:08, Brian Norris 写道:
> Note that a 2-jiffy timeout does not, in fact, totally resolve my
> problems; with a timeout of 2 jiffies, I still get a timeout that
> (according to getnstimeofday()) occurs after only 56us. It does
Since the 2-jiffy timeout does not resolve your problem, I suggest you try
the latest linux-next tree.


thanks
Huang Shijie

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: mmotm 2013-06-05-17-24 uploaded

2013-06-05 Thread Stephen Rothwell

Hi Andrew,

On Wed, 05 Jun 2013 17:26:36 -0700 a...@linux-foundation.org wrote:
>
>   linux-next-git-rejects.patch

We must figure out why you sometimes get rejects that I do not get when I
import your series into a git tree.  However in this case your resolution
is not quite right.  It leaves 2 continue statements in
net/mac80211/iface.c at line 191 which will unconditionally short circuit
the enclosing loop.  The version that will be in linux-next today is
correct (and git did it automatically as part of the merge of the old
linux-next tree).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

pgp6hFfBMo28T.pgp
Description: PGP signature

[RFC] ext4: simplify the code a bit

2013-06-05 Thread Stephen Rothwell

Hi guys,

I noticed the following warning in a linux-next build:

fs/ext4/inode.c: In function 'ext4_da_writepages':
fs/ext4/inode.c:2212:6: warning: 'err' may be used uninitialized in this 
function [-Wmaybe-uninitialized]

In tracking this down, I followed the call chains and discovered that
io_submit_init_bio() only ever returned 0.  Switching that to return
void and following back up the chain led to the following patch.  This
makes it far more obvious that by the end of the loop in
mpage_map_and_submit_extent(), err must be zero, which allows the final
removal of err2 and the check that caused the above warning.

This does remove the above warning and simplifies the code a bit, but may
be removing error infrastructure that may be used in the future.

Signed-off-by: Stephen Rothwell 
---
 fs/ext4/ext4.h|  8 
 fs/ext4/inode.c   | 46 +-
 fs/ext4/page-io.c | 35 ++-
 3 files changed, 23 insertions(+), 66 deletions(-)

This is against today's ext4 tree dev branch.

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bd9890f..1341452 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2660,10 +2660,10 @@ extern void ext4_io_submit_init(struct ext4_io_submit 
*io,
 extern void ext4_end_io_rsv_work(struct work_struct *work);
 extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
-extern int ext4_bio_write_page(struct ext4_io_submit *io,
-  struct page *page,
-  int len,
-  struct writeback_control *wbc);
+extern void ext4_bio_write_page(struct ext4_io_submit *io,
+   struct page *page,
+   int len,
+   struct writeback_control *wbc);
 
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 442c5d2..80bc416 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1832,7 +1832,6 @@ out:
 static int ext4_writepage(struct page *page,
  struct writeback_control *wbc)
 {
-   int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
@@ -1884,11 +1883,11 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
-   ret = ext4_bio_write_page(_submit, page, len, wbc);
+   ext4_bio_write_page(_submit, page, len, wbc);
ext4_io_submit(_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
-   return ret;
+   return 0;
 }
 
 #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
@@ -1963,11 +1962,10 @@ static bool add_page_bufs_to_extent(struct 
mpage_da_data *mpd,
return true;
 }
 
-static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+static void mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
 {
int len;
loff_t size = i_size_read(mpd->inode);
-   int err;
 
BUG_ON(page->index != mpd->first_page);
if (page->index == size >> PAGE_CACHE_SHIFT)
@@ -1975,12 +1973,9 @@ static int mpage_submit_page(struct mpage_da_data *mpd, 
struct page *page)
else
len = PAGE_CACHE_SIZE;
clear_page_dirty_for_io(page);
-   err = ext4_bio_write_page(>io_submit, page, len, mpd->wbc);
-   if (!err)
-   mpd->wbc->nr_to_write--;
+   ext4_bio_write_page(>io_submit, page, len, mpd->wbc);
+   mpd->wbc->nr_to_write--;
mpd->first_page++;
-
-   return err;
 }
 
 /*
@@ -1997,7 +1992,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, 
struct page *page)
  * mapped, we update @map to the next extent in the last page that needs
  * mapping. Otherwise we submit the page for IO.
  */
-static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+static void mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 {
struct pagevec pvec;
int nr_pages, i;
@@ -2009,7 +2004,6 @@ static int mpage_map_and_submit_buffers(struct 
mpage_da_data *mpd)
pgoff_t start, end;
ext4_lblk_t lblk;
sector_t pblock;
-   int err;
 
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2043,7 +2037,7 @@ static int mpage_map_and_submit_buffers(struct 
mpage_da_data *mpd)
add_page_bufs_to_extent(mpd, head, bh,
lblk);
pagevec_release();
-   return 0;
+   return;
}
if (buffer_delay(bh)) {

[PATCH v3.9 stable] PCI: acpiphp: Re-enumerate devices when host bridge receives Bus Check

2013-06-05 Thread Yinghai Lu

When a PCI host bridge device receives a Bus Check notification, we
must re-enumerate starting with the bridge to discover changes (devices
that have been added or removed).

Prior to 668192b678 ("PCI: acpiphp: Move host bridge hotplug to
pci_root.c"), this happened in _handle_hotplug_event_bridge().  After that
commit, _handle_hotplug_event_bridge() is not installed for host bridges,
and the host bridge notify handler, _handle_hotplug_event_root() did not
re-enumerate.

This patch adds re-enumeration to _handle_hotplug_event_root().

This fixes cases where we don't notice the addition or removal of
PCI devices, e.g., the PCI-to-USB ExpressCard in the bugzilla below.

-v1: Backport of 3f327e39b4 to v3.9 by Bjorn Helgaas 
-v2: use request_module("acpiphp") for the case where acpiphp is built as a
 module instead of built-in, by Yinghai.

[bhelgaas: changelog, references]
Reference: 
https://lkml.kernel.org/r/CAAh6nkmbKR3HTqm5ommevsBwhL_u0N8Rk7Wsms_LfP=nbgk...@mail.gmail.com
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=57961
Reported-by: Gavin Guo 
Tested-by: Gavin Guo 
Signed-off-by: Yinghai Lu 
Signed-off-by: Bjorn Helgaas 
Acked-by: Rafael J. Wysocki 
CC: sta...@vger.kernel.org  # v3.9+
---
 drivers/acpi/pci_root.c|7 ++-
 drivers/pci/hotplug/acpiphp_glue.c |   14 ++
 include/linux/pci-acpi.h   |2 ++
 3 files changed, 22 insertions(+), 1 deletion(-)

Index: linux-3.9.4/drivers/acpi/pci_root.c
===
--- linux-3.9.4.orig/drivers/acpi/pci_root.c
+++ linux-3.9.4/drivers/acpi/pci_root.c
@@ -665,6 +665,7 @@ static void handle_root_bridge_removal(s
kfree(ej_event);
 }
 
+void (*acpiphp_check_host_bridge)(acpi_handle handle);
 static void _handle_hotplug_event_root(struct work_struct *work)
 {
struct acpi_pci_root *root;
@@ -687,7 +688,11 @@ static void _handle_hotplug_event_root(s
/* bus enumerate */
printk(KERN_DEBUG "%s: Bus check notify on %s\n", __func__,
 (char *)buffer.pointer);
-   if (!root)
+   if (root) {
+   request_module("acpiphp");
+   if (acpiphp_check_host_bridge)
+   acpiphp_check_host_bridge(handle);
+   } else
handle_root_bridge_insertion(handle);
 
break;
Index: linux-3.9.4/drivers/pci/hotplug/acpiphp_glue.c
===
--- linux-3.9.4.orig/drivers/pci/hotplug/acpiphp_glue.c
+++ linux-3.9.4/drivers/pci/hotplug/acpiphp_glue.c
@@ -1122,6 +1122,18 @@ check_sub_bridges(acpi_handle handle, u3
return AE_OK ;
 }
 
+static void __acpiphp_check_host_bridge(acpi_handle handle)
+{
+   struct acpiphp_bridge *bridge;
+
+   bridge = acpiphp_handle_to_bridge(handle);
+   if (bridge)
+   acpiphp_check_bridge(bridge);
+
+   acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+   ACPI_UINT32_MAX, check_sub_bridges, NULL, NULL, NULL);
+}
+
 static void _handle_hotplug_event_bridge(struct work_struct *work)
 {
struct acpiphp_bridge *bridge;
@@ -1305,6 +1317,7 @@ static struct acpi_pci_driver acpi_pci_h
 int __init acpiphp_glue_init(void)
 {
acpi_pci_register_driver(_pci_hp_driver);
+   acpiphp_check_host_bridge = __acpiphp_check_host_bridge;
 
return 0;
 }
@@ -1317,6 +1330,7 @@ int __init acpiphp_glue_init(void)
  */
 void  acpiphp_glue_exit(void)
 {
+   acpiphp_check_host_bridge = NULL;
acpi_pci_unregister_driver(_pci_hp_driver);
 }
 
Index: linux-3.9.4/include/linux/pci-acpi.h
===
--- linux-3.9.4.orig/include/linux/pci-acpi.h
+++ linux-3.9.4/include/linux/pci-acpi.h
@@ -43,6 +43,8 @@ static inline acpi_handle acpi_pci_get_b
 }
 #endif
 
+extern void (*acpiphp_check_host_bridge)(acpi_handle handle);
+
 #ifdef CONFIG_ACPI_APEI
 extern bool aer_acpi_firmware_first(void);
 #else
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC v2 0/3] add cpu physically hotplug driver

2013-06-05 Thread liguang

This patch-set tries to support physical hot-plug/unplug of
a cpu automatically, that is:
if you offline a cpu, it will automatically actually remove
the cpu, and if you hot-plug a cpu, then it will online this
cpu automatically.
so, offline is just like eject, but eject attribute seems not
available since recent kernel (can't figure out when); with
this driver, if allowed, it will trigger a cpu eject process.
and for automatic online, it was said there are objections,
don't know the reason, so, send this patch-set boldly.

of course, this approach is for QEMU's cpu hotplug emulation
only, but is not limited to it; if someone would like to explore
ec space to implement cpu hot-plug/unplug for a real platform,
please feel free to continue.

Li Guang (3)
 drivers/platform/x86: add cpu physically hotplug driver
 ec: add ec space notifier
 cpu_physic_hotplug: register handler for ec space notifier

drivers/acpi/ec.c | 32 
drivers/platform/x86/Kconfig  |  8 
drivers/platform/x86/Makefile |  1 +
drivers/platform/x86/cpu_physic_hotplug.c | 90 +
include/linux/acpi.h  |  2 ++
5 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 drivers/platform/x86/cpu_physic_hotplug.c
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC v2 2/3] ec: add ec space notifier

2013-06-05 Thread liguang

add a notifier for anyone who is interested in
ec space changes.

Signed-off-by: liguang 
---
 drivers/acpi/ec.c|   32 
 include/linux/acpi.h |2 ++
 2 files changed, 34 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index edc0081..dee3417 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -124,6 +124,35 @@ static int EC_FLAGS_MSI; /* Out-of-spec MSI controller */
 static int EC_FLAGS_VALIDATE_ECDT; /* ASUStec ECDTs need to be validated */
 static int EC_FLAGS_SKIP_DSDT_SCAN; /* Not all BIOS survive early DSDT scan */
 
+/* notifier chain for who are intresting in ec space changing */
+static RAW_NOTIFIER_HEAD(ec_space_chain);
+
+int __ref register_ec_space_notifier(struct notifier_block *nb)
+{
+int ret;
+
+ret = raw_notifier_chain_register(_space_chain, nb);
+
+return ret;
+}
+EXPORT_SYMBOL(register_ec_space_notifier);
+
+void __ref unregister_ec_space_notifier(struct notifier_block *nb)
+{
+
+raw_notifier_chain_unregister(_space_chain, nb);
+}
+EXPORT_SYMBOL(unregister_ec_space_notifier);
+
+static int ec_space_notify(void *data)
+{
+int ret;
+
+ret = __raw_notifier_call_chain(_space_chain, 0, data, -1, NULL);
+
+ return notifier_to_errno(ret);
+}
+
 /* --
  Transaction Management
-- 
*/
@@ -638,6 +667,9 @@ static u32 acpi_ec_gpe_handler(acpi_handle gpe_device,
wake_up(>wait);
ec_check_sci(ec, acpi_ec_read_status(ec));
}
+
+   ec_space_notify(data);
+
return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 17b5b59..4fe2247 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -158,6 +158,8 @@ extern int ec_transaction(u8 command,
   const u8 *wdata, unsigned wdata_len,
   u8 *rdata, unsigned rdata_len);
 extern acpi_handle ec_get_handle(void);
+extern int register_ec_space_notifier(struct notifier_block *nb);
+extern void unregister_ec_space_notifier(struct notifier_block *nb);
 
 #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE)
 
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC v2 1/3] drivers/platform/x86: add cpu physically hotplug driver

2013-06-05 Thread liguang

this driver will support cpu physical add/removal automatically
after online/offline. if cpu hotplugged, cpu will not
online automatically, and for cpu offline, we try to
do actually eject if allowed for cpu like
"echo 1 > /sys/bus/acpi/devices/LNXCPU\:0X/eject"
this "echo ..." is only present for recent kernel
(sorry, can't figure out since when), for a little
older kernel, there's not such approach AFAICS.

Signed-off-by: liguang 
---
 drivers/platform/x86/Kconfig  |8 
 drivers/platform/x86/Makefile |1 +
 drivers/platform/x86/cpu_physic_hotplug.c |   60 +
 3 files changed, 69 insertions(+), 0 deletions(-)
 create mode 100644 drivers/platform/x86/cpu_physic_hotplug.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 8577261..39b2392 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -789,4 +789,12 @@ config PVPANIC
  a paravirtualized device provided by QEMU; it lets a virtual machine
  (guest) communicate panic events to the host.
 
+config QEMU_CPU_PHYSIC_HOTPLUG
+   tristate "physically add/remove cpu after cpu onlined/offlined"
+   depends on ACPI_HOTPLUG_CPU
+   ---help---
+ This driver will support physically remove a cpu after
+ it offlined for QEMU automatically. someone may require this feature
+ to do a physically removal for a cpu.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index ef0ec74..2e669b0 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -53,3 +53,4 @@ obj-$(CONFIG_APPLE_GMUX)  += apple-gmux.o
 obj-$(CONFIG_CHROMEOS_LAPTOP)  += chromeos_laptop.o
 
 obj-$(CONFIG_PVPANIC)   += pvpanic.o
+obj-$(CONFIG_QEMU_CPU_PHYSIC_HOTPLUG)  += cpu_physic_hotplug.o
diff --git a/drivers/platform/x86/cpu_physic_hotplug.c 
b/drivers/platform/x86/cpu_physic_hotplug.c
new file mode 100644
index 000..a52c042
--- /dev/null
+++ b/drivers/platform/x86/cpu_physic_hotplug.c
@@ -0,0 +1,60 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+MODULE_AUTHOR("Li Guang");
+MODULE_DESCRIPTION("CPU physically hot-plug/unplug Driver");
+MODULE_LICENSE("GPL");
+
+static int cpu_logic_hotplug_notify(struct notifier_block *nfb,
+   unsigned long action, void *hcpu)
+{
+   unsigned int cpu = (unsigned long)hcpu;
+   struct acpi_processor *pr = per_cpu(processors, cpu);
+
+   if (pr) {
+   switch (action) {
+   case CPU_ONLINE:
+   break;
+   case CPU_DEAD:
+   break;
+   default:
+   break;
+   }
+   }
+   return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_logic_hotplug_notifier =
+{
+   .notifier_call = cpu_logic_hotplug_notify,
+};
+
+static int cpu_physic_hotplug_notify(struct notifier_block *nfb,
+unsigned char *s)
+{
+}
+
+static struct notifier_block cpu_physic_hotplug_notifier =
+{
+   .notifier_call = cpu_physic_hotplug_notify,
+};
+
+static int __init cpu_qemu_hotplug_init(void)
+{
+   register_hotcpu_notifier(_logic_hotplug_notifier);
+   register_ec_gpe_notifier(_physic_hotplug_notifier);
+   return 0;
+}
+
+static void __exit cpu_qemu_hotplug_exit(void)
+{
+   unregister_hotcpu_notifier(_logic_hotplug_notifier);
+   unregister_ec_gpe_notifier(_physic_hotplug_notifier);
+}
+
+module_init(cpu_qemu_hotplug_init);
+module_exit(cpu_qemu_hotplug_exit);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC v2 3/3] cpu_physic_hotplug: register handler for ec space notifier

2013-06-05 Thread liguang

Signed-off-by: liguang 
---
 drivers/platform/x86/cpu_physic_hotplug.c |   30 ++--
 1 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/platform/x86/cpu_physic_hotplug.c 
b/drivers/platform/x86/cpu_physic_hotplug.c
index a52c042..1cdac1b 100644
--- a/drivers/platform/x86/cpu_physic_hotplug.c
+++ b/drivers/platform/x86/cpu_physic_hotplug.c
@@ -9,6 +9,11 @@ MODULE_AUTHOR("Li Guang");
 MODULE_DESCRIPTION("CPU physically hot-plug/unplug Driver");
 MODULE_LICENSE("GPL");
 
+#define EC_SPACE_CPU_IDX   3
+#define EC_SPACE_CPU_OFFSET4
+#define EC_SPACE_CPU_CMD  2
+#define EC_CPU_EJECT0xE7
+
 static int cpu_logic_hotplug_notify(struct notifier_block *nfb,
unsigned long action, void *hcpu)
 {
@@ -20,6 +25,7 @@ static int cpu_logic_hotplug_notify(struct notifier_block 
*nfb,
case CPU_ONLINE:
break;
case CPU_DEAD:
+   ec_write(EC_SPACE_CPU_CMD, EC_CPU_EJECT);
break;
default:
break;
@@ -34,8 +40,26 @@ static struct notifier_block cpu_logic_hotplug_notifier =
 };
 
 static int cpu_physic_hotplug_notify(struct notifier_block *nfb,
-unsigned char *s)
+unsigned long action, void *s)
 {
+   u8 index = 0, val = 0;
+   bool cpu_state = false;
+   struct acpi_processor *pr;
+
+   ec_read(EC_SPACE_CPU_IDX, );
+   if (index == 0)
+   goto out;
+   pr = per_cpu(processors, index);
+
+   ec_read(EC_SPACE_CPU_OFFSET + index/8, );
+   if (val & 1 << index/8)
+   cpu_state = true;
+
+   if (pr->flags.need_hotplug_init & cpu_state)
+   cpu_up(pr->id);
+
+out:
+return NOTIFY_OK;
 }
 
 static struct notifier_block cpu_physic_hotplug_notifier =
@@ -46,14 +70,14 @@ static struct notifier_block cpu_physic_hotplug_notifier =
 static int __init cpu_qemu_hotplug_init(void)
 {
register_hotcpu_notifier(_logic_hotplug_notifier);
-   register_ec_gpe_notifier(_physic_hotplug_notifier);
+   register_ec_space_notifier(_physic_hotplug_notifier);
return 0;
 }
 
 static void __exit cpu_qemu_hotplug_exit(void)
 {
unregister_hotcpu_notifier(_logic_hotplug_notifier);
-   unregister_ec_gpe_notifier(_physic_hotplug_notifier);
+   unregister_ec_space_notifier(_physic_hotplug_notifier);
 }
 
 module_init(cpu_qemu_hotplug_init);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7 00/11] per-cgroup cpu-stat

2013-06-05 Thread Tejun Heo

Hello, Glauber.

On Wed, May 29, 2013 at 03:03:11PM +0400, Glauber Costa wrote:
> I am *not* going as far as marking cpuacct deprecated, because I think it
> deserves a special discussion (even though my position in this matter is 
> widely
> known), but all the infrastructure to make it happen is here. But after this,
> it should be a matter of setting a flag (or not).

I'll ensure that cpuacct can't be used with sane_behavior from cgroup
core side, so that it can be deprecated with multiple hierarchies
eventually.

> Through this patchset I am making cpu cgroup provide the same functionality of
> cpuacct, and now with a more clear semantics, I attempt to provide userspace
> with enough information to reconstruct per-container version of files like
> "/proc/stat". In particular, we are interested in knowing the per-cgroup 
> slices
> of user time, system time, wait time, number of processes, and a variety of
> statistics.
> 
> To make sure we can count nr of switches correctly, I am resurrecting one of
> Peter's patches that apparently got nowhere in the past.

I generally agree with the approach but which tree is it based on?
Can you please let me know the base commit so that I can review the
series properly?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] wait: fix false timeouts when using wait_event_timeout()

2013-06-05 Thread Tejun Heo

Hello, Oleg.

On Wed, Jun 05, 2013 at 09:07:23PM +0200, Oleg Nesterov wrote:
> And in fact, perhaps we can implement wait_event_common() and avoid the
> code duplications?
> 
>   #define __wait_no_timeout(timeout)  \
>   (__builtin_constant_p(timeout) && (timeout) == 
> MAX_SCHEDULE_TIMEOUT)
> 
>   /* uglified signal_pending_state() */
>   #define __wait_signal_pending(state)
> \
>   ((state == TASK_INTERRUPTIBLE) ? signal_pending(current) :  
> \
>(state == TASK_KILLABLE) ? fatal_signal_pending(current) : 
> \
> 0)
> 
>   #define __wait_event_common(wq, condition, state, tout) 
> \
>   ({  
> \
>   DEFINE_WAIT(__wait);
> \
>   long __ret = 0, __tout = tout;  
> \
>   
> \
>   for (;;) {  
> \
>   prepare_to_wait(, &__wait, state);   
> \
>   if (condition) {
> \
>   __ret = __wait_no_timeout(tout) ?: __tout ?: 1; 
> \
>   break;  
> \
>   }   
> \
>   
> \
>   if (__wait_signal_pending(state)) { 
> \
>   __ret = -ERESTARTSYS;   
> \
>   break;  
> \
>   }   
> \
>   
> \
>   if (__wait_no_timeout(tout))
> \
>   schedule(); 
> \
>   else if (__tout)
> \
>   __tout = schedule_timeout(__tout);  
> \
>   else
> \
>   break;  
> \
>   }   
> \
>   finish_wait(, &__wait);  
> \
>   __ret;  
> \
>   })

Heh, yeah, this looks good to me and a lot better than trying to do
the same thing over and over again and ending up with subtle
differences.

> Hmm. I compiled the kernel with the patch below,
> 
>   $ size vmlinux
>          text    data      bss      dec     hex filename
>   -   4978601 2935080 10104832 18018513 112f0d1 vmlinux
>   +   4977769 2930984 10104832 18013585 112dd91 vmlinux

Nice.  Provided you went over assembly outputs of at least some
combinations, please feel free to add

 Reviewed-by: Tejun Heo 

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCHv3 1/3] sched_clock: Add support for >32 bit sched_clock

2013-06-05 Thread Stephen Boyd

On 06/05, John Stultz wrote:
> On 06/05/2013 04:54 PM, Stephen Boyd wrote:
> >
> >I've noticed that we probably need to update the mult/shift
> >calculation similar to how clocksources are done. Should we
> >just copy/paste the maxsec calculation code here or do something
> >smarter?
> 
> So, the clocksource calculation has an extra variable it has to
> balance, which is the granularity of ntp adjustments being made
> (since with higher shift values, we can make relatively smaller
> changes by +1 or -1 from mult).
> 
> sched_clock doesn't have to deal with frequency adjustments, so the
> shift value just needs to be high enough to be able to accurately
> express the desired counter frequency.  Too high and you risk
> multiplication overflows if there are large gaps between updates,
> too low though and you run into possible accuracy issues (though I
> hope there isn't much that's using sched_clock for long-term timing
> where slight accuracy issues would be problematic).
> 
> So I think it's ok if the sched_clock code uses its own logic for
> calculating the mult/shift pair, since the constraints are different
> than what we expect from timekeeping.
> 

I was thinking perhaps we can do the (1 << bits) / rate thing but
not limit it to 600 seconds. Instead let it be as big as it
actually is? Right now it's actually better to register as a 32
bit clock because the wraparound comes out to be larger when
maxsec is 0.

> 
> >
> >  include/linux/sched_clock.h |  1 +
> >  kernel/time/sched_clock.c   | 41 +++--
> >  2 files changed, 28 insertions(+), 14 deletions(-)
> >
> >diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
> >index fa7922c..81baaef 100644
> >--- a/include/linux/sched_clock.h
> >+++ b/include/linux/sched_clock.h
> >@@ -15,6 +15,7 @@ static inline void sched_clock_postinit(void) { }
> >  #endif
> >  extern void setup_sched_clock(u32 (*read)(void), int bits, unsigned long 
> > rate);
> >+extern void sched_clock_setup(u64 (*read)(void), int bits, unsigned long 
> >rate);
> 
> Eww. This sort of word-swizzled function names makes patch reviewing a pain.

How about sched_clock_register() or register_sched_clock()?

> 
> I know you're trying to deprecate the old function and provide a
> smooth transition, but would you also consider including follow-on
> patch/patches with this set that converts the existing
> setup_sched_clock usage (at least just the ones in
> drivers/clocksource?) so it doesn't stick around forever?
> 
> And if not, at least add a clear comment here, and maybe some build
> warnings to the old function so the driver owners know to make the
> conversion happen quickly.

Yes I plan to send out the conversion patches and deprecate the
function if this is acceptable. Then we can remove the function
after the merge window is over and all stragglers are converted.

> 
> 
> 
> >  extern unsigned long long (*sched_clock_func)(void);
> >diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> >index aad1ae6..3478b6d 100644
> >--- a/kernel/time/sched_clock.c
> >+++ b/kernel/time/sched_clock.c
> >@@ -35,24 +36,31 @@ static struct clock_data cd = {
> > .mult   = NSEC_PER_SEC / HZ,
> >  };
> >-static u32 __read_mostly sched_clock_mask = 0xffffffff;
> >+static u64 __read_mostly sched_clock_mask;
> >-static u32 notrace jiffy_sched_clock_read(void)
> >+static u64 notrace jiffy_sched_clock_read(void)
> >  {
> >-return (u32)(jiffies - INITIAL_JIFFIES);
> >+return (u64)(jiffies - INITIAL_JIFFIES);
> >  }
> 
> Also, you might add a comment noting you register jiffies w/
> BITS_PER_LONG, to clarify don't have to use jiffies_64 here on 32bit
> systems (despite the u64 cast)?

Sure. Perhaps it is clearer if we don't have the u64 cast here at
all?

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] spi/sirf: fix error propagation in spi_sirfsoc_probe()

2013-06-05 Thread Barry Song

2013/6/6 Barry Song <21cn...@gmail.com>:
> 2013/6/6 Alexey Khoroshilov :
>> If pinctrl_get_select_default() fails, spi_sirfsoc_probe()
>> returns IS_ERR(sspi->p) instead of PTR_ERR(sspi->p).
>>
>> Found by Linux Driver Verification project (linuxtesting.org).
>>
>> Signed-off-by: Alexey Khoroshilov 
>
> Acked-by: Barry Song 

sorry. nack, i think pinctrl_get_select_default(&pdev->dev) should
have been dropped in another patch.

>
>> ---
>>  drivers/spi/spi-sirf.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/spi/spi-sirf.c b/drivers/spi/spi-sirf.c
>> index 0808cd5..94c3920 100644
>> --- a/drivers/spi/spi-sirf.c
>> +++ b/drivers/spi/spi-sirf.c
>> @@ -559,9 +559,10 @@ static int spi_sirfsoc_probe(struct platform_device 
>> *pdev)
>> sspi->bitbang.master->dev.of_node = pdev->dev.of_node;
>>
>> sspi->p = pinctrl_get_select_default(>dev);
>> -   ret = IS_ERR(sspi->p);
>> -   if (ret)
>> +   if (IS_ERR(sspi->p)) {
>> +   ret = PTR_ERR(sspi->p);
>> goto free_master;
>> +   }
>>
>> sspi->clk = clk_get(>dev, NULL);
>> if (IS_ERR(sspi->clk)) {
>> --
>> 1.8.1.2

-barry
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 0/5] clk: implement remuxing during set_rate

2013-06-05 Thread Haojian Zhuang

On Mon, May 20, 2013 at 9:28 PM, James Hogan  wrote:
> This patchset adds support for automatic selection of the best parent
> for a clock mux, i.e. the one which can provide the closest clock rate
> to that requested. It can be disabled by a new CLK_SET_RATE_NO_REPARENT
> flag (which is set for all uses of clk_register_mux()).
>
> This works by way of adding a new op, determine_rate, similar to
> round_rate but with an extra parameter to allow the clock driver to
> optionally select a different parent clock. This is used in
> clk_calc_new_rates to decide whether to initiate a set_parent operation.
>
> Changes in v4:
>
> * rebased on clk-next ("clk: sun5i: Add compatibles for Allwinner A13").
> * replace __clk_set_parent_no_recalc with __clk_set_parent.
> * never pass NULL to determine_rate's best_parent_clk parameter, and
>   slight refactor of __clk_round_rate to use local copy of clk->parent.
> * a few new comments around use of clk::new_child.
> * new patch (patch 2) split out of patch 3 to avoid having to declare
>   static __clk_set_parent() at the top of clk.c, and to ease readability
>   of patch 3.
>
> Changes in v3:
>
> * rebased on v3.10-rc1.
> * remove double underscore prefix from clk_get_parent_by_index()
> * store new_parent_index in struct clk too (calculated from
>   clk_fetch_parent_index, and passed through __clk_set_parent_no_recalc
>   to __clk_set_parent).
> * allow determine_rate to satisfy recalc_rate check in __clk_init.
> * rename/invert CLK_SET_RATE_REMUX to CLK_SET_RATE_NO_REPARENT and move
>   to patch 3.
> * patch 3: add CLK_SET_RATE_NO_REPARENT flag to all callers of
>   clk_register_mux. If you don't mind your clocks being reparented in
>   response to set_rate please let me know and I'll drop the relevant
>   portion of the patch.
>
> Changes in v2:
>
> I've moved the mux determine_rate implementation into a core helper, but
> I haven't pushed it fully into the core, as I think it just wouldn't
> work correctly for more complex clocks, e.g. if you (theoretically) had
> a combined mux and divide, you'd want to intercept the determine_rate
> and ask for a larger rate from the parent clocks, then return the
> divided rate. This should be possible by wrapping the mux determine_rate
> helper.
>
> Patch 1 still exports the __clk_get_parent_by_index as it seems like it
> might be a useful thing for clock implementations to have access to if
> they ever wanted to do something more fancy with changing clock parents.
>
> I haven't made any attempt to implement the atomic set_parent+set_rate
> as I don't have hardware that could take proper advantage of it, but it
> shouldn't be too difficult for others to implement if they wanted since
> they're fairly close to one another (in clk_change_rate()).
>
> * switched to using new determine_rate op rather than adding an argument
>   to round_rate.
> * moved mux implementation into a single helper which should be usable
>   from more complex clocks which can mux.
> * rewrite main implementation so that no changes are made until after
>   the PRE notifications have been sent, and in a way that should ensure
>   correct notifications without duplicates, and I think should be safe
>   in the event of a notification failing.
> * various tidy ups and fixes.
>
> James Hogan (5):
>   clk: abstract parent cache
>   clk: move some parent related functions upwards
>   clk: add support for clock reparent on set_rate
>   clk: add CLK_SET_RATE_NO_REPARENT flag
>   clk: clk-mux: implement remuxing on set_rate
>
>  Documentation/clk.txt|   4 +
>  arch/arm/mach-imx/clk.h  |   5 +-
>  drivers/clk/clk-mux.c|   1 +
>  drivers/clk/clk.c| 416 
> ++-
>  drivers/clk/mmp/clk-mmp2.c   |  39 ++--
>  drivers/clk/mmp/clk-pxa168.c |  40 ++--
>  drivers/clk/mmp/clk-pxa910.c |  31 ++-
>  drivers/clk/mxs/clk.h|   4 +-
>  drivers/clk/samsung/clk.h|   2 +-
>  drivers/clk/spear/spear1310_clock.c  | 179 +++
>  drivers/clk/spear/spear1340_clock.c  |  97 
>  drivers/clk/spear/spear3xx_clock.c   |  57 +++--
>  drivers/clk/spear/spear6xx_clock.c   |  35 +--
>  drivers/clk/sunxi/clk-sunxi.c|   3 +-
>  drivers/clk/tegra/clk-tegra114.c |  36 ++-
>  drivers/clk/tegra/clk-tegra20.c  |   6 +-
>  drivers/clk/tegra/clk-tegra30.c  |  33 ++-
>  drivers/clk/versatile/clk-vexpress.c |   4 +-
>  include/linux/clk-private.h  |   3 +
>  include/linux/clk-provider.h |  12 +
>  20 files changed, 614 insertions(+), 393 deletions(-)
>
> --
> 1.8.1.2

Tested-by: Haojian Zhuang 

Pass test on MMP & Hisilicon SoC.

Regards
Haojian
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] corner cases of open() on procfs symlinks

2013-06-05 Thread Linus Torvalds

On Thu, Jun 6, 2013 at 10:20 AM, Al Viro  wrote:
> I'm not sure whether to treat that as a bug or as a weird misfeature
> enshrined in userland ABI:
> open("/tmp", O_CREAT, 0) => -EISDIR // LAST_NORM case
> open("/", O_CREAT, 0) => -EISDIR// LAST_ROOT
> open(".", O_CREAT, 0) => -EISDIR// LAST_DOT
> open("..", O_CREAT, 0) => -EISDIR   // LAST_DOTDOT
> open("/proc/self/cwd", O_CREAT, 0) => success   // LAST_BIND
> open("/proc/self/cwd/", O_CREAT, 0) => -EISDIR  // trailing slashes

Ok, that looks buggy. O_CREAT should definitely return EISDIR for
/proc/self/cwd too, since it's a directory. I don't think the
O_RDWR/O_WRONLY thing should matter.

>I would obviously
> like to do that - do_last() is far too convoluted as it is; the only
> question is whether we can change the first weirdness...  Comments?

Exactly which cases does that change? I have no objections if it's
only the "LAST_BIND" case that now starts returning EISDIR. Is there
anything else it affects?

That said, obviously if something breaks, we'd have to revert it, and
as a cleanup rather than some serious bug (ie this doesn't cause
crashes or security issues), I suspect this should wait until 3.11
regardless. No?

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] spi/sirf: fix error propagation in spi_sirfsoc_probe()

2013-06-05 Thread Barry Song

2013/6/6 Alexey Khoroshilov :
> If pinctrl_get_select_default() fails, spi_sirfsoc_probe()
> returns IS_ERR(sspi->p) instead of PTR_ERR(sspi->p).
>
> Found by Linux Driver Verification project (linuxtesting.org).
>
> Signed-off-by: Alexey Khoroshilov 

Acked-by: Barry Song 

> ---
>  drivers/spi/spi-sirf.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/spi/spi-sirf.c b/drivers/spi/spi-sirf.c
> index 0808cd5..94c3920 100644
> --- a/drivers/spi/spi-sirf.c
> +++ b/drivers/spi/spi-sirf.c
> @@ -559,9 +559,10 @@ static int spi_sirfsoc_probe(struct platform_device 
> *pdev)
> sspi->bitbang.master->dev.of_node = pdev->dev.of_node;
>
> sspi->p = pinctrl_get_select_default(>dev);
> -   ret = IS_ERR(sspi->p);
> -   if (ret)
> +   if (IS_ERR(sspi->p)) {
> +   ret = PTR_ERR(sspi->p);
> goto free_master;
> +   }
>
> sspi->clk = clk_get(>dev, NULL);
> if (IS_ERR(sspi->clk)) {
> --
> 1.8.1.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: stop_machine lockup issue in 3.9.y.

2013-06-05 Thread Eric Dumazet

On Wed, 2013-06-05 at 14:11 -0700, Tejun Heo wrote:
> (cc'ing wireless crowd, tglx and Ingo.  The original thread is at
>  http://thread.gmane.org/gmane.linux.kernel/1500158/focus=55005 )
> 
> Hello, Ben.
> 
> On Wed, Jun 05, 2013 at 01:58:31PM -0700, Ben Greear wrote:
> > Hmm, wonder if I found it.  I previously saw times where it appears
> > jiffies does not increment.  __do_softirq has a break-out based on
> > jiffies timeout.  Maybe that is failing to get us out of __do_softirq
> > in my lockup case because for whatever reason the system cannot update
> > jiffies in this case?
> > 
> > I added this (probably whitespace damaged) hack and now I have not been
> > able to reproduce the problem.
> 
> Ah, nice catch. :)
> 
> > diff --git a/kernel/softirq.c b/kernel/softirq.c
> > index 14d7758..621ea3b 100644
> > --- a/kernel/softirq.c
> > +++ b/kernel/softirq.c
> > @@ -212,6 +212,7 @@ asmlinkage void __do_softirq(void)
> > unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
> > int cpu;
> > unsigned long old_flags = current->flags;
> > +   unsigned long loops = 0;
> > 
> > /*
> >  * Mask out PF_MEMALLOC s current task context is borrowed for the
> > @@ -241,6 +242,7 @@ restart:
> > unsigned int vec_nr = h - softirq_vec;
> > int prev_count = preempt_count();
> > 
> > +   loops++;
> > kstat_incr_softirqs_this_cpu(vec_nr);
> > 
> > trace_softirq_entry(vec_nr);
> > @@ -265,7 +267,7 @@ restart:
> > 
> > pending = local_softirq_pending();
> > if (pending) {
> > -   if (time_before(jiffies, end) && !need_resched())
> > +   if (time_before(jiffies, end) && !need_resched() && (loops 
> > < 500))
> > goto restart;
> 
> So, softirq most likely kicked off from ath9k is rescheduling itself
> to the extent where it ends up locking out the CPU completely.  The
> problem is usually okay because the processing would break out in 2ms
> but as jiffies is stopped in this case with all other CPUs trapped in
> stop_machine, the loop never breaks and the machine hangs.  While
> adding the counter limit probably isn't a bad idea, softirq requeueing
> itself indefinitely sounds pretty buggy.
> 
> ath9k people, do you guys have any idea what's going on?  Why would
> softirq repeat itself indefinitely?
> 
> Ingo, Thomas, we're seeing a stop_machine hanging because
> 
> * All other CPUs entered IRQ disabled stage.  Jiffies is not being
>   updated.
> 
> * The last CPU get caught up executing softirq indefinitely.  As
>   jiffies doesn't get updated, it never breaks out of softirq
>   handling.  This is a deadlock.  This CPU won't break out of softirq
>   handling unless jiffies is updated and other CPUs can't do anything
>   until this CPU enters the same stop_machine stage.
> 
> Ben found out that breaking out of softirq handling after certain
> number of repetitions makes the issue go away, which isn't a proper
> fix but we might want anyway.  What do you guys think?
> 

Interesting

Before 3.9 and commit c10d73671ad30f5469
("softirq: reduce latencies") we used to limit the __do_softirq() loop
to 10.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC 1/3] drivers/platform/x86: add cpu physically hotplug driver

2013-06-05 Thread liguang

this driver will support cpu physical add/removal automatically
after online/offline. if cpu hotplugged, cpu will not
online automatically, and for cpu offline, we try to
do actually eject if allowed for cpu like
"echo 1 > /sys/bus/acpi/devices/LNXCPU\:0X/eject"
this "echo ..." is only present for recent kernel
(sorry, can't figure out since when), for a little
older kernel, there's not such approach AFAICS.

Signed-off-by: liguang 
---
 drivers/platform/x86/Kconfig  |8 
 drivers/platform/x86/Makefile |1 +
 drivers/platform/x86/cpu_physic_hotplug.c |   60 +
 3 files changed, 69 insertions(+), 0 deletions(-)
 create mode 100644 drivers/platform/x86/cpu_physic_hotplug.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 8577261..39b2392 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -789,4 +789,12 @@ config PVPANIC
  a paravirtualized device provided by QEMU; it lets a virtual machine
  (guest) communicate panic events to the host.
 
+config QEMU_CPU_PHYSIC_HOTPLUG
+   tristate "physically add/remove cpu after cpu onlined/offlined"
+   depends on ACPI_HOTPLUG_CPU
+   ---help---
+ This driver will support physically remove a cpu after
+ it offlined for QEMU automatically. someone may require this feature
+ to do a physically removal for a cpu.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index ef0ec74..2e669b0 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -53,3 +53,4 @@ obj-$(CONFIG_APPLE_GMUX)  += apple-gmux.o
 obj-$(CONFIG_CHROMEOS_LAPTOP)  += chromeos_laptop.o
 
 obj-$(CONFIG_PVPANIC)   += pvpanic.o
+obj-$(CONFIG_QEMU_CPU_PHYSIC_HOTPLUG)  += cpu_physic_hotplug.o
diff --git a/drivers/platform/x86/cpu_physic_hotplug.c 
b/drivers/platform/x86/cpu_physic_hotplug.c
new file mode 100644
index 000..a52c042
--- /dev/null
+++ b/drivers/platform/x86/cpu_physic_hotplug.c
@@ -0,0 +1,60 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+MODULE_AUTHOR("Li Guang");
+MODULE_DESCRIPTION("CPU physically hot-plug/unplug Driver");
+MODULE_LICENSE("GPL");
+
+static int cpu_logic_hotplug_notify(struct notifier_block *nfb,
+   unsigned long action, void *hcpu)
+{
+   unsigned int cpu = (unsigned long)hcpu;
+   struct acpi_processor *pr = per_cpu(processors, cpu);
+
+   if (pr) {
+   switch (action) {
+   case CPU_ONLINE:
+   break;
+   case CPU_DEAD:
+   break;
+   default:
+   break;
+   }
+   }
+   return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_logic_hotplug_notifier =
+{
+   .notifier_call = cpu_logic_hotplug_notify,
+};
+
+static int cpu_physic_hotplug_notify(struct notifier_block *nfb,
+unsigned char *s)
+{
+}
+
+static struct notifier_block cpu_physic_hotplug_notifier =
+{
+   .notifier_call = cpu_physic_hotplug_notify,
+};
+
+static int __init cpu_qemu_hotplug_init(void)
+{
+   register_hotcpu_notifier(_logic_hotplug_notifier);
+   register_ec_gpe_notifier(_physic_hotplug_notifier);
+   return 0;
+}
+
+static void __exit cpu_qemu_hotplug_exit(void)
+{
+   unregister_hotcpu_notifier(_logic_hotplug_notifier);
+   unregister_ec_gpe_notifier(_physic_hotplug_notifier);
+}
+
+module_init(cpu_qemu_hotplug_init);
+module_exit(cpu_qemu_hotplug_exit);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC 2/3] ec: add ec space notifier

2013-06-05 Thread liguang

Add a notifier for anyone who is interested in
EC space changes.

Signed-off-by: liguang 
---
 drivers/acpi/ec.c|   32 
 include/linux/acpi.h |2 ++
 2 files changed, 34 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index edc0081..dee3417 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -124,6 +124,35 @@ static int EC_FLAGS_MSI; /* Out-of-spec MSI controller */
 static int EC_FLAGS_VALIDATE_ECDT; /* ASUStec ECDTs need to be validated */
 static int EC_FLAGS_SKIP_DSDT_SCAN; /* Not all BIOS survive early DSDT scan */
 
+/* notifier chain for who are intresting in ec space changing */
+static RAW_NOTIFIER_HEAD(ec_space_chain);
+
+int __ref register_ec_space_notifier(struct notifier_block *nb)
+{
+int ret;
+
+ret = raw_notifier_chain_register(_space_chain, nb);
+
+return ret;
+}
+EXPORT_SYMBOL(register_ec_space_notifier);
+
+void __ref unregister_ec_space_notifier(struct notifier_block *nb)
+{
+
+raw_notifier_chain_unregister(_space_chain, nb);
+}
+EXPORT_SYMBOL(unregister_ec_space_notifier);
+
+static int ec_space_notify(void *data)
+{
+int ret;
+
+ret = __raw_notifier_call_chain(_space_chain, 0, data, -1, NULL);
+
+ return notifier_to_errno(ret);
+}
+
 /* --
  Transaction Management
-- 
*/
@@ -638,6 +667,9 @@ static u32 acpi_ec_gpe_handler(acpi_handle gpe_device,
wake_up(>wait);
ec_check_sci(ec, acpi_ec_read_status(ec));
}
+
+   ec_space_notify(data);
+
return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 17b5b59..4fe2247 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -158,6 +158,8 @@ extern int ec_transaction(u8 command,
   const u8 *wdata, unsigned wdata_len,
   u8 *rdata, unsigned rdata_len);
 extern acpi_handle ec_get_handle(void);
+extern int register_ec_space_notifier(struct notifier_block *nb);
+extern void unregister_ec_space_notifier(struct notifier_block *nb);
 
 #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE)
 
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC 3/3] cpu_physic_hotplug: register handler for ec space notifier

2013-06-05 Thread liguang

Signed-off-by: liguang 
---
 drivers/platform/x86/cpu_physic_hotplug.c |   27 +--
 1 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/cpu_physic_hotplug.c 
b/drivers/platform/x86/cpu_physic_hotplug.c
index a52c042..a84c999 100644
--- a/drivers/platform/x86/cpu_physic_hotplug.c
+++ b/drivers/platform/x86/cpu_physic_hotplug.c
@@ -9,6 +9,9 @@ MODULE_AUTHOR("Li Guang");
 MODULE_DESCRIPTION("CPU physically hot-plug/unplug Driver");
 MODULE_LICENSE("GPL");
 
+#define EC_SPACE_CPU_IDX   3
+#define EC_SPACE_CPU_OFFSET4
+
 static int cpu_logic_hotplug_notify(struct notifier_block *nfb,
unsigned long action, void *hcpu)
 {
@@ -36,6 +39,26 @@ static struct notifier_block cpu_logic_hotplug_notifier =
 static int cpu_physic_hotplug_notify(struct notifier_block *nfb,
 unsigned char *s)
 {
+   u8 index = 0, val = 0;
+   bool cpu_state = false;
+   struct acpi_processor *pr;
+
+   ec_read(EC_SPACE_CPU_IDX, );
+   if (index == 0)
+   goto out;
+   pr = per_cpu(processors, index);
+
+   ec_read(EC_SPACE_CPU_OFFSET + index/8, );
+   if (val & 1 << index/8)
+   cpu_state = true;
+
+   if (pr->flags.need_hotplug_init & cpu_state)
+   cpu_up(pr->id);
+   else
+   cpu_down(pr->id);
+
+out:
+return NOTIFY_OK;
 }
 
 static struct notifier_block cpu_physic_hotplug_notifier =
@@ -46,14 +69,14 @@ static struct notifier_block cpu_physic_hotplug_notifier =
 static int __init cpu_qemu_hotplug_init(void)
 {
register_hotcpu_notifier(_logic_hotplug_notifier);
-   register_ec_gpe_notifier(_physic_hotplug_notifier);
+   register_ec_space_notifier(_physic_hotplug_notifier);
return 0;
 }
 
 static void __exit cpu_qemu_hotplug_exit(void)
 {
unregister_hotcpu_notifier(_logic_hotplug_notifier);
-   unregister_ec_gpe_notifier(_physic_hotplug_notifier);
+   unregister_ec_space_notifier(_physic_hotplug_notifier);
 }
 
 module_init(cpu_qemu_hotplug_init);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFC 0/3] add cpu physically hotplug driver

2013-06-05 Thread liguang

This patch-set tries to support physically hot-plugging/unplugging
a cpu automatically, that is:
if you offline a cpu, it will automatically actually remove
a cpu, and if you hot-plug a cpu, then it will online this
cpu automatically.
so, offline is just like eject, but eject attribute seems not
available since recent kernel(can't figure out when), with
this driver, if allowed, it will trigger a eject cpu process.
and for automatically online, it was said there are objections,
don't know the reason, so, send this patch-set boldly.

Of course, this approach is for QEMU's hotplug cpu emulation
only, but is not limited to it; if someone would like to explore ec space to
implement cpu hot-plug/unplug for a real platform, please
feel free to continue.

Li Guang (3)
 drivers/platform/x86: add cpu physically hotplug driver
 ec: add ec space notifier
 cpu_physic_hotplug: register handler for ec space notifier

drivers/acpi/ec.c | 32 
drivers/platform/x86/Kconfig  |  8 
drivers/platform/x86/Makefile |  1 +
drivers/platform/x86/cpu_physic_hotplug.c | 87 +
include/linux/acpi.h  |  2 ++
5 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 drivers/platform/x86/cpu_physic_hotplug.c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] usbnet: improve/fix status interrupt endpoint interval

2013-06-05 Thread Ming Lei

On Thu, Jun 6, 2013 at 12:34 AM, Andreas Mohr  wrote:
> Hi,
>
> On Wed, Jun 05, 2013 at 09:22:25AM +0800, Ming Lei wrote:
>> On Wed, Jun 5, 2013 at 2:28 AM, Andreas Mohr  wrote:
>> >
>> > From 307685fe8e6dfc8181e30167b9c31479332cb22f Mon Sep 17 00:00:00 2001
>> > From: Andreas Mohr 
>> > Date: Sun, 2 Jun 2013 20:37:05 +0200
>> > Subject: [PATCH] usbnet: improve/fix status interrupt endpoint interval
>> >  tweaking.
>> >
>> > - failed to take super-speed into account
>> > - <= full-speed seems to have wrong value (specified as frames [ms],
>> >   thus 3 is not suitable to achieve 8ms)
>>
>> The above change is correct.
>>
>> >   Value 8 now managed to reduce powertop wakeups from ~ 540 to ~ 155
>>
>> It means that your device only returns current link status instead of link
>> change. IMO, it isn't a good behaviour for the device.
>
> I don't quite understand that.

It is only concluded by the data you provided,  when you get ~540 wakeups,
that means basically device will return data for each polling from HC.

Also I am wondering why you get ~540 wakeups, instead of ~360(330 + 30)
(30 is guessed from ~155 wakeups in 8ms interval)

Did you check intr_complete() returns OK every time?

> The way I see it is that there's the "20 times same value" averaging,
> and once that was successful, a link change gets communicated
> (usbnet_link_change()). Thus that merely results in a *delay*
> in signalling the link change...
>
>> In fact, you still can increase the period only for your device, for example,
>> 128ms/256ms/512ms should be accepted.
>
> Possibly.
>
>> > - add detailed docs and question marks about current practice
>>
>> But the doc need to be fixed.
>
> Hmm.
>
>> > /* avoid 1 msec chatter:  min 8 msec poll rate */
>> > +   /* High/SuperSpeed expresses intervals in microframes
>> > +* (in logarithmic encoding, PRIOR to encoding in URB)
>> > +* rather than frames.
>> > +* Thus, for >= HighSpeed: == X [microframes] * 125us [-> 8ms],
>> > +* <= FullSpeed: == X [ms] [-> 8ms].
>> > +* Finally, it's questionable whether we'll even get away unscathed
>> > +* with doing such rate tweaking at all:
>> > +* bInterval value is declared as being a hard demand by a device
>>
>> It isn't a hard demand, which only means the poll interval by which HC
>> sends IN token to device.
>
> I believe this number is meant to be a hard demand by the *device*,
> since a device is the authoritative party to know best about its
> own servicing requirements.

Actually, just look at the quirks for USB devices: there are many devices which
aren't worthy of trust, :-)

Also, some problems should have been reported with the current interval
value (larger than the interval of the endpoint) if it was a hard demand, but luckily
it looks like no such report has been found.

As I said before, the link change is a low frequency event, so longer
interval used by usbnet driver should be OK, right?

> Or, IOW, the thing that is a USB descriptor is to be seen as a *protocol*
> where a device signals its requirements (hopefully accurately, though!).
> And if it indicates a 1ms bInterval (which is "the requested maximum(!!)
> number of milliseconds between transaction attempts" [lvr usbfaq]),

USB spec 2.0 doesn't say it is a maximum period between transactions,
and only mentions that is a "desired bus access period", see "5.7.4
Interrupt Transfer Bus Access Constraints".

> then one could argue that the servicing party (the kernel) damn well
> ought to follow through (unless it reliably knows that it can violate
> some parts of these demands without getting caught).
>
>> > +* in order to guarantee having its I/O needs serviced properly...
>> > +* if we don't do this, then... [overruns], [fire], [apocalypse]?
>>
>> Not so serious, if one packet is ready, the late poll from HC may still
>> get the packet since device can buffer the packet, but if it is too late,
>> the successor packet might be missed by device.
>
> Is proper damage-less (overflows...) handling here a promise/guarantee
> that's made by the USB specs?

Even there is overflow, it happens inside device, and it depends on
implementation of device itself.

> Otherwise I wouldn't be so confident that a device is acting this way ;)

If so, you can use the dev->status->desc.bInterval, so you may complain
too much wakeup and CPU power consumed, and we need leverage.

>
>> For usbnet, generally speaking, the interrupt pipe is for polling the
>> link change, which is a very low frequency event, so you don't need to
>> worry about missing events if the interval is increased.
>
> Yeah, but then those status bits also contain other error info for every 
> packet
> processed, thus it's also very useful to achieve polling that's frequent
> enough to properly grab info for all transferred ether frames, rather
> than merely concentrating on link change info.

Actually, most of usbnet drivers only use interrupt pipe to retrieve link
change(asix, smsc, ...). But if

Re: [PATCH v2 3/4] perf report: Add --time-filter option

2013-06-05 Thread David Ahern


On 6/2/13 10:44 PM, Namhyung Kim wrote:

From: Namhyung Kim 

The --time-filter option is for limiting samples within a range of
time.  A time range looks like <time1>-<time2> and at most one of them
can be omitted.  This can be useful when analyzing a part of a huge
data only.

Cc: Joonsoo Kim 
Cc: David Ahern 
Signed-off-by: Namhyung Kim 


Acked-Tested-by: David Ahern 



---
  tools/perf/Documentation/perf-report.txt |  7 +++
  tools/perf/builtin-report.c  | 27 +++
  2 files changed, 34 insertions(+)

diff --git a/tools/perf/Documentation/perf-report.txt 
b/tools/perf/Documentation/perf-report.txt
index 66dab7410c1d..04a96f657ca7 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -214,6 +214,13 @@ OPTIONS
Do not show entries which have an overhead under that percent.
(Default: 0).

+-X::
+--time-filter::
+   Report samples within a range of time only. A time range can be given
+   like 'time1-time2' and treated as a start time and end time
+   respectively. The time format is like ".". Either of time1
+   or time2 can be omitted.
+
  SEE ALSO
  
  linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index ca98d34cd58b..e09e1bdb1401 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -53,6 +53,7 @@ struct perf_report {
const char  *cpu_list;
const char  *symbol_filter_str;
float   min_percent;
+   u64 time_start, time_end;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
  };

@@ -318,6 +319,12 @@ static int process_sample_event(struct perf_tool *tool,
if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
return 0;

+   if (rep->time_start && rep->time_start > sample->time)
+   return 0;
+
+   if (rep->time_end && rep->time_end < sample->time)
+   return 0;
+
if (sort__mode == SORT_MODE__BRANCH) {
ret = perf_report__add_branch_hist_entry(tool, , sample,
 evsel, machine);
@@ -714,6 +721,24 @@ parse_percent_limit(const struct option *opt, const char 
*str,
return 0;
  }

+static int
+parse_time_filter(const struct option *opt, const char *str,
+ int unset __maybe_unused)
+{
+   struct perf_report *rep = opt->value;
+   char *sep;
+
+   sep = strchr(str, '-');
+   if (sep == NULL)
+   return parse_nsec_time(str, >time_start);
+   else if (sep == str)
+   return parse_nsec_time(++str, >time_end);
+
+   *sep++ = '\0';
+   return parse_nsec_time(str, >time_start) ||
+   parse_nsec_time(sep , >time_end);
+}
+
  int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
  {
struct perf_session *session;
@@ -825,6 +850,8 @@ int cmd_report(int argc, const char **argv, const char 
*prefix __maybe_unused)
OPT_BOOLEAN(0, "mem-mode", _mode, "mem access profile"),
OPT_CALLBACK(0, "percent-limit", , "percent",
 "Don't show entries under that percent", 
parse_percent_limit),
+   OPT_CALLBACK('X', "time-filter", , "time[-time]",
+"Only display entries in the time range", 
parse_time_filter),
OPT_END()
};




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 2/4] perf script: Add --time-filter option

2013-06-05 Thread David Ahern


On 6/2/13 10:44 PM, Namhyung Kim wrote:

From: Namhyung Kim 

The --time-filter option is for limiting samples within a range of
time.  A time range looks like <time1>-<time2> and at most one of them
can be omitted.  For instance:

   $ perf script --time-filter -2178446.12
   ...
  xchat  1772 [002] 2178446.070330: sched:sched_switch: prev_comm=xchat 
prev_pid=177
swapper 0 [002] 2178446.070338: power:cpu_idle: state=4 cpu_id=2
swapper 0 [001] 2178446.091952: sched:sched_wakeup: comm=synergys 
pid=1488 prio=
swapper 0 [001] 2178446.091958: power:cpu_idle: state=4294967295 
cpu_id=1
swapper 0 [001] 2178446.091970: sched:sched_switch: 
prev_comm=swapper/1 prev_pid
   synergys  1488 [001] 2178446.091995: sched:sched_switch: 
prev_comm=synergys prev_pid=
swapper 0 [001] 2178446.092003: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 2178446.116997: sched:sched_wakeup: comm=synergys 
pid=1488 prio=
swapper 0 [001] 2178446.117004: power:cpu_idle: state=4294967295 
cpu_id=1
swapper 0 [001] 2178446.117016: sched:sched_switch: 
prev_comm=swapper/1 prev_pid
   synergys  1488 [001] 2178446.117040: sched:sched_switch: 
prev_comm=synergys prev_pid=
swapper 0 [001] 2178446.117048: power:cpu_idle: state=4 cpu_id=1

Above example only displays samples which have a timestamp before
2178446.12.

Cc: Joonsoo Kim 
Cc: David Ahern 
Signed-off-by: Namhyung Kim 


Acked-Tested-by: David Ahern 



---
  tools/perf/Documentation/perf-script.txt |  7 +++
  tools/perf/builtin-script.c  | 32 
  2 files changed, 39 insertions(+)

diff --git a/tools/perf/Documentation/perf-script.txt 
b/tools/perf/Documentation/perf-script.txt
index e9cbfcddfa3f..c4994c5f27ff 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -203,6 +203,13 @@ OPTIONS
  --show-kernel-path::
Try to resolve the path of [kernel.kallsyms]

+-X::
+--time-filter::
+   Display samples within a range of time only. A time range can be given
+   like 'time1-time2' and treated as a start time and end time
+respectively. The time format is like ".". Either of time1
+   or time2 can be omitted.
+
  SEE ALSO
  
  linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 92d4658f56fb..d598765e59cb 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -28,6 +28,13 @@ static bool  system_wide;
  static const char *cpu_list;
  static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);

+struct time_range {
+   u64 start;
+   u64 end;
+};
+
+static struct time_range trange;
+
  enum perf_output_field {
PERF_OUTPUT_COMM= 1U << 0,
PERF_OUTPUT_TID = 1U << 1,
@@ -510,6 +517,12 @@ static int process_sample_event(struct perf_tool *tool 
__maybe_unused,
if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
return 0;

+   if (trange.start && trange.start > sample->time)
+   return 0;
+
+   if (trange.end && trange.end < sample->time)
+   return 0;
+
scripting_ops->process_event(event, sample, evsel, machine, );

evsel->hists.stats.total_period += sample->period;
@@ -1236,6 +1249,23 @@ static int have_cmd(int argc, const char **argv)
return 0;
  }

+static int
+parse_time_filter(const struct option *opt, const char *str,
+ int unset __maybe_unused)
+{
+   struct time_range *time_range = opt->value;
+   char *sep = strchr(str, '-');
+
+   if (sep == NULL)
+   return parse_nsec_time(str, _range->start);
+   else if (sep == str)
+   return parse_nsec_time(++str, _range->end);
+
+   *sep++ = '\0';
+   return parse_nsec_time(str, _range->start) ||
+   parse_nsec_time(sep, _range->end);
+}
+
  int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
  {
bool show_full_info = false;
@@ -1286,6 +1316,8 @@ int cmd_script(int argc, const char **argv, const char 
*prefix __maybe_unused)
"display extended information from perf.data file"),
OPT_BOOLEAN('\0', "show-kernel-path", _conf.show_kernel_path,
"Show the path of [kernel.kallsyms]"),
+   OPT_CALLBACK('X', "time-filter", , "time[-time]",
+"Only display entries in the time range", 
parse_time_filter),
OPT_END()
};
const char * const script_usage[] = {



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1994 matches

Mail list logo