from:"Tom Gundersen"

Re: [PATCH] net/unix: drop obsolete fd-recursion limits

2017-07-17 Thread Tom Gundersen

On Mon, Jul 17, 2017 at 11:35 AM, David Herrmann <dh.herrm...@gmail.com> wrote:
> All unix sockets now account inflight FDs to the respective sender.
> This was introduced in:
>
> commit 712f4aad406bb1ed67f3f98d04c044191f0ff593
> Author: willy tarreau <w...@1wt.eu>
> Date:   Sun Jan 10 07:54:56 2016 +0100
>
> unix: properly account for FDs passed over unix sockets
>
> and further refined in:
>
> commit 415e3d3e90ce9e18727e8843ae343eda5a58fad6
> Author: Hannes Frederic Sowa <han...@stressinduktion.org>
> Date:   Wed Feb 3 02:11:03 2016 +0100
>
> unix: correctly track in-flight fds in sending process user_struct
>
> Hence, regardless of the stacking depth of FDs, the total number of
> inflight FDs is limited, and accounted. There is no known way for a
> local user to exceed those limits or exploit the accounting.
>
> Furthermore, the GC logic is independent of the recursion/stacking depth
> as well. It solely depends on the total number of inflight FDs,
> regardless of their layout.
>
> Lastly, the current `recursion_level' suffers a TOCTOU race, since it
> checks and inherits depths only at queue time. If we consider `A<-B' to
> mean `queue-B-on-A', the following sequence circumvents the recursion
> level easily:
>
> A<-B
>B<-C
>   C<-D
>  ...
>Y<-Z
>
> resulting in:
>
> A<-B<-C<-...<-Z
>
> With all of this in mind, lets drop the recursion limit. It has no
> additional security value, anymore. On the contrary, it randomly
> confuses message brokers that try to forward file-descriptors, since
> any sendmsg(2) call can fail spuriously with ETOOMANYREFS if a client
> maliciously modifies the FD while inflight.
>
> Cc: Alban Crequy <alban.cre...@collabora.co.uk>
> Cc: Simon McVittie <simon.mcvit...@collabora.co.uk>
> Signed-off-by: David Herrmann <dh.herrm...@gmail.com>
Reviewed-by: Tom Gundersen <t...@jklm.no>
> ---
>  include/net/af_unix.h |  1 -
>  net/unix/af_unix.c| 24 +---
>  2 files changed, 1 insertion(+), 24 deletions(-)
>
> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> index 678e4d6fa317..3b3194b2fc65 100644
> --- a/include/net/af_unix.h
> +++ b/include/net/af_unix.h
> @@ -58,7 +58,6 @@ struct unix_sock {
> struct list_headlink;
> atomic_long_t   inflight;
> spinlock_t  lock;
> -   unsigned char   recursion_level;
> unsigned long   gc_flags;
>  #define UNIX_GC_CANDIDATE  0
>  #define UNIX_GC_MAYBE_CYCLE1
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 7b52a380d710..5c53f22d62e8 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1528,26 +1528,13 @@ static inline bool too_many_unix_fds(struct 
> task_struct *p)
> return false;
>  }
>
> -#define MAX_RECURSION_LEVEL 4
> -
>  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
>  {
> int i;
> -   unsigned char max_level = 0;
>
> if (too_many_unix_fds(current))
> return -ETOOMANYREFS;
>
> -   for (i = scm->fp->count - 1; i >= 0; i--) {
> -   struct sock *sk = unix_get_socket(scm->fp->fp[i]);
> -
> -   if (sk)
> -   max_level = max(max_level,
> -   unix_sk(sk)->recursion_level);
> -   }
> -   if (unlikely(max_level > MAX_RECURSION_LEVEL))
> -   return -ETOOMANYREFS;
> -
> /*
>  * Need to duplicate file references for the sake of garbage
>  * collection.  Otherwise a socket in the fps might become a
> @@ -1559,7 +1546,7 @@ static int unix_attach_fds(struct scm_cookie *scm, 
> struct sk_buff *skb)
>
> for (i = scm->fp->count - 1; i >= 0; i--)
> unix_inflight(scm->fp->user, scm->fp->fp[i]);
> -   return max_level;
> +   return 0;
>  }
>
>  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool 
> send_fds)
> @@ -1649,7 +1636,6 @@ static int unix_dgram_sendmsg(struct socket *sock, 
> struct msghdr *msg,
> struct sk_buff *skb;
> long timeo;
> struct scm_cookie scm;
> -   int max_level;
> int data_len = 0;
> int sk_locked;
>
> @@ -1701,7 +1687,6 @@ static int unix_dgram_sendmsg(struct socket *sock, 
> struct msghdr *msg,
> err = unix_scm_to_skb(, skb, true);
> if (err < 0)
> goto out_free;
> -   max_level = err + 1

Re: [PATCH] net/unix: drop obsolete fd-recursion limits

2017-07-17 Thread Tom Gundersen

On Mon, Jul 17, 2017 at 11:35 AM, David Herrmann  wrote:
> All unix sockets now account inflight FDs to the respective sender.
> This was introduced in:
>
> commit 712f4aad406bb1ed67f3f98d04c044191f0ff593
> Author: willy tarreau 
> Date:   Sun Jan 10 07:54:56 2016 +0100
>
> unix: properly account for FDs passed over unix sockets
>
> and further refined in:
>
> commit 415e3d3e90ce9e18727e8843ae343eda5a58fad6
> Author: Hannes Frederic Sowa 
> Date:   Wed Feb 3 02:11:03 2016 +0100
>
> unix: correctly track in-flight fds in sending process user_struct
>
> Hence, regardless of the stacking depth of FDs, the total number of
> inflight FDs is limited, and accounted. There is no known way for a
> local user to exceed those limits or exploit the accounting.
>
> Furthermore, the GC logic is independent of the recursion/stacking depth
> as well. It solely depends on the total number of inflight FDs,
> regardless of their layout.
>
> Lastly, the current `recursion_level' suffers a TOCTOU race, since it
> checks and inherits depths only at queue time. If we consider `A<-B' to
> mean `queue-B-on-A', the following sequence circumvents the recursion
> level easily:
>
> A<-B
>B<-C
>   C<-D
>  ...
>Y<-Z
>
> resulting in:
>
> A<-B<-C<-...<-Z
>
> With all of this in mind, lets drop the recursion limit. It has no
> additional security value, anymore. On the contrary, it randomly
> confuses message brokers that try to forward file-descriptors, since
> any sendmsg(2) call can fail spuriously with ETOOMANYREFS if a client
> maliciously modifies the FD while inflight.
>
> Cc: Alban Crequy 
> Cc: Simon McVittie 
> Signed-off-by: David Herrmann 
Reviewed-by: Tom Gundersen 
> ---
>  include/net/af_unix.h |  1 -
>  net/unix/af_unix.c| 24 +---
>  2 files changed, 1 insertion(+), 24 deletions(-)
>
> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> index 678e4d6fa317..3b3194b2fc65 100644
> --- a/include/net/af_unix.h
> +++ b/include/net/af_unix.h
> @@ -58,7 +58,6 @@ struct unix_sock {
> struct list_headlink;
> atomic_long_t   inflight;
> spinlock_t  lock;
> -   unsigned char   recursion_level;
> unsigned long   gc_flags;
>  #define UNIX_GC_CANDIDATE  0
>  #define UNIX_GC_MAYBE_CYCLE1
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 7b52a380d710..5c53f22d62e8 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1528,26 +1528,13 @@ static inline bool too_many_unix_fds(struct 
> task_struct *p)
> return false;
>  }
>
> -#define MAX_RECURSION_LEVEL 4
> -
>  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
>  {
> int i;
> -   unsigned char max_level = 0;
>
> if (too_many_unix_fds(current))
> return -ETOOMANYREFS;
>
> -   for (i = scm->fp->count - 1; i >= 0; i--) {
> -   struct sock *sk = unix_get_socket(scm->fp->fp[i]);
> -
> -   if (sk)
> -   max_level = max(max_level,
> -   unix_sk(sk)->recursion_level);
> -   }
> -   if (unlikely(max_level > MAX_RECURSION_LEVEL))
> -   return -ETOOMANYREFS;
> -
> /*
>  * Need to duplicate file references for the sake of garbage
>  * collection.  Otherwise a socket in the fps might become a
> @@ -1559,7 +1546,7 @@ static int unix_attach_fds(struct scm_cookie *scm, 
> struct sk_buff *skb)
>
> for (i = scm->fp->count - 1; i >= 0; i--)
> unix_inflight(scm->fp->user, scm->fp->fp[i]);
> -   return max_level;
> +   return 0;
>  }
>
>  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool 
> send_fds)
> @@ -1649,7 +1636,6 @@ static int unix_dgram_sendmsg(struct socket *sock, 
> struct msghdr *msg,
> struct sk_buff *skb;
> long timeo;
> struct scm_cookie scm;
> -   int max_level;
> int data_len = 0;
> int sk_locked;
>
> @@ -1701,7 +1687,6 @@ static int unix_dgram_sendmsg(struct socket *sock, 
> struct msghdr *msg,
> err = unix_scm_to_skb(, skb, true);
> if (err < 0)
> goto out_free;
> -   max_level = err + 1;
>
> skb_put(skb, len - data_len);
> skb->data_len = data_len;
> @@ -1819,8 +1804,6 @@ static int unix_dgram_sendmsg(struct socket *sock, 
> struct msghdr *msg,
>

Re: [PATCH 2/2] x86/sysfb: fix lfb_size calculation

2016-11-15 Thread Tom Gundersen

On Tue, Nov 15, 2016 at 1:01 PM, David Herrmann <dh.herrm...@gmail.com> wrote:
> The screen_info.lfb_size field is shifted by 16 bits *only* in case of
> VBE. This has historical reasons since VBE advertised it similarly.
> However, in case of EFI framebuffers, the size is no longer shifted. Fix
> the x86 simple-framebuffer setup code to use the correct size in the
> non-VBE case.
>
> While at it, avoid variable abbreviations and rename 'len' to 'length',
> and use the correct types matching the screen_info definition.
>
> Signed-off-by: David Herrmann <dh.herrm...@gmail.com>

Reviewed-by: Tom Gundersen <t...@jklm.no>

> ---
>  arch/x86/kernel/sysfb_simplefb.c | 25 +
>  1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kernel/sysfb_simplefb.c 
> b/arch/x86/kernel/sysfb_simplefb.c
> index 35b8641..85195d4 100644
> --- a/arch/x86/kernel/sysfb_simplefb.c
> +++ b/arch/x86/kernel/sysfb_simplefb.c
> @@ -66,8 +66,8 @@ __init int create_simplefb(const struct screen_info *si,
>  {
> struct platform_device *pd;
> struct resource res;
> -   unsigned long len;
> -   u64 base;
> +   u64 base, size;
> +   u32 length;
>
> /*
>  * If the 64BIT_BASE capability is set, ext_lfb_base will contain the
> @@ -82,11 +82,20 @@ __init int create_simplefb(const struct screen_info *si,
> return -EINVAL;
> }
>
> -   /* don't use lfb_size as it may contain the whole VMEM instead of only
> -* the part that is occupied by the framebuffer */
> -   len = mode->height * mode->stride;
> -   len = PAGE_ALIGN(len);
> -   if (len > (u64)si->lfb_size << 16) {
> +   /*
> +* Don't use lfb_size as IORESOURCE size, since it may contain the
> +* entire VMEM, and thus require huge mappings. Use just the part we
> +* need, that is, the part where the framebuffer is located. But 
> verify
> +* that it does not exceed the advertised VMEM.
> +* Note that in case of VBE, the lfb_size is shifted by 16 bits for
> +* historical reasons.
> +*/
> +   size = si->lfb_size;
> +   if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
> +   size <<= 16;
> +   length = mode->height * mode->stride;
> +   length = PAGE_ALIGN(length);
> +   if (length > size) {
> printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
> return -EINVAL;
> }
> @@ -96,7 +105,7 @@ __init int create_simplefb(const struct screen_info *si,
> res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> res.name = simplefb_resname;
> res.start = base;
> -   res.end = res.start + len - 1;
> +   res.end = res.start + length - 1;
> if (res.end <= res.start)
> return -EINVAL;
>
> --
> 2.10.2
>

Re: [PATCH 2/2] x86/sysfb: fix lfb_size calculation

2016-11-15 Thread Tom Gundersen

On Tue, Nov 15, 2016 at 1:01 PM, David Herrmann  wrote:
> The screen_info.lfb_size field is shifted by 16 bits *only* in case of
> VBE. This has historical reasons since VBE advertised it similarly.
> However, in case of EFI framebuffers, the size is no longer shifted. Fix
> the x86 simple-framebuffer setup code to use the correct size in the
> non-VBE case.
>
> While at it, avoid variable abbreviations and rename 'len' to 'length',
> and use the correct types matching the screen_info definition.
>
> Signed-off-by: David Herrmann 

Reviewed-by: Tom Gundersen 

> ---
>  arch/x86/kernel/sysfb_simplefb.c | 25 +
>  1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kernel/sysfb_simplefb.c 
> b/arch/x86/kernel/sysfb_simplefb.c
> index 35b8641..85195d4 100644
> --- a/arch/x86/kernel/sysfb_simplefb.c
> +++ b/arch/x86/kernel/sysfb_simplefb.c
> @@ -66,8 +66,8 @@ __init int create_simplefb(const struct screen_info *si,
>  {
> struct platform_device *pd;
> struct resource res;
> -   unsigned long len;
> -   u64 base;
> +   u64 base, size;
> +   u32 length;
>
> /*
>  * If the 64BIT_BASE capability is set, ext_lfb_base will contain the
> @@ -82,11 +82,20 @@ __init int create_simplefb(const struct screen_info *si,
> return -EINVAL;
> }
>
> -   /* don't use lfb_size as it may contain the whole VMEM instead of only
> -* the part that is occupied by the framebuffer */
> -   len = mode->height * mode->stride;
> -   len = PAGE_ALIGN(len);
> -   if (len > (u64)si->lfb_size << 16) {
> +   /*
> +* Don't use lfb_size as IORESOURCE size, since it may contain the
> +* entire VMEM, and thus require huge mappings. Use just the part we
> +* need, that is, the part where the framebuffer is located. But 
> verify
> +* that it does not exceed the advertised VMEM.
> +* Note that in case of VBE, the lfb_size is shifted by 16 bits for
> +* historical reasons.
> +*/
> +   size = si->lfb_size;
> +   if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
> +   size <<= 16;
> +   length = mode->height * mode->stride;
> +   length = PAGE_ALIGN(length);
> +   if (length > size) {
> printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
> return -EINVAL;
> }
> @@ -96,7 +105,7 @@ __init int create_simplefb(const struct screen_info *si,
> res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> res.name = simplefb_resname;
> res.start = base;
> -   res.end = res.start + len - 1;
> +   res.end = res.start + length - 1;
> if (res.end <= res.start)
> return -EINVAL;
>
> --
> 2.10.2
>

Re: [PATCH 1/2] x86/sysfb: add support for 64bit EFI lfb_base

2016-11-15 Thread Tom Gundersen

On Tue, Nov 15, 2016 at 1:01 PM, David Herrmann <dh.herrm...@gmail.com> wrote:
> The screen_info object was extended to support 64bit lfb_base addresses
> in:
>
> commit ae2ee627dc87a70910de91b791b3cd0e9c6facdd
> Author: Matt Fleming <matt.flem...@intel.com>
> Date:   Tue Aug 25 16:32:55 2015 +0100
>
> efifb: Add support for 64-bit frame buffer addresses
>
> However, the x86 simple-framebuffer setup code never made use of it. Fix
> it to properly assemble and verify the lfb_base before advertising
> simple-framebuffer devices.
>
> In particular, this means if VIDEO_CAPABILITY_64BIT_BASE is set, the
> screen_info->ext_lfb_base field will contain the upper 32bit of the
> actual lfb_base. Make sure the address is not 0 (i.e., unset), as well as
> does not overflow the physical address type.
>
> Signed-off-by: David Herrmann <dh.herrm...@gmail.com>

Reviewed-by: Tom Gundersen <t...@jklm.no>

> ---
>  arch/x86/kernel/sysfb_simplefb.c | 18 --
>  1 file changed, 16 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/sysfb_simplefb.c 
> b/arch/x86/kernel/sysfb_simplefb.c
> index 764a29f..35b8641 100644
> --- a/arch/x86/kernel/sysfb_simplefb.c
> +++ b/arch/x86/kernel/sysfb_simplefb.c
> @@ -67,6 +67,20 @@ __init int create_simplefb(const struct screen_info *si,
> struct platform_device *pd;
> struct resource res;
> unsigned long len;
> +   u64 base;
> +
> +   /*
> +* If the 64BIT_BASE capability is set, ext_lfb_base will contain the
> +* upper half of the base address. Assemble the address, then make 
> sure
> +* it is valid and we can actually access it.
> +*/
> +   base = si->lfb_base;
> +   if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
> +   base |= (u64)si->ext_lfb_base << 32;
> +   if (!base || (u64)(resource_size_t)base != base) {
> +   printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
> +   return -EINVAL;
> +   }
>
> /* don't use lfb_size as it may contain the whole VMEM instead of only
>  * the part that is occupied by the framebuffer */
> @@ -81,8 +95,8 @@ __init int create_simplefb(const struct screen_info *si,
> memset(, 0, sizeof(res));
> res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> res.name = simplefb_resname;
> -   res.start = si->lfb_base;
> -   res.end = si->lfb_base + len - 1;
> +   res.start = base;
> +   res.end = res.start + len - 1;
> if (res.end <= res.start)
> return -EINVAL;
>
> --
> 2.10.2
>

Re: [PATCH 1/2] x86/sysfb: add support for 64bit EFI lfb_base

2016-11-15 Thread Tom Gundersen

On Tue, Nov 15, 2016 at 1:01 PM, David Herrmann  wrote:
> The screen_info object was extended to support 64bit lfb_base addresses
> in:
>
> commit ae2ee627dc87a70910de91b791b3cd0e9c6facdd
> Author: Matt Fleming 
> Date:   Tue Aug 25 16:32:55 2015 +0100
>
> efifb: Add support for 64-bit frame buffer addresses
>
> However, the x86 simple-framebuffer setup code never made use of it. Fix
> it to properly assemble and verify the lfb_base before advertising
> simple-framebuffer devices.
>
> In particular, this means if VIDEO_CAPABILITY_64BIT_BASE is set, the
> screen_info->ext_lfb_base field will contain the upper 32bit of the
> actual lfb_base. Make sure the address is not 0 (i.e., unset), as well as
> does not overflow the physical address type.
>
> Signed-off-by: David Herrmann 

Reviewed-by: Tom Gundersen 

> ---
>  arch/x86/kernel/sysfb_simplefb.c | 18 --
>  1 file changed, 16 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/sysfb_simplefb.c 
> b/arch/x86/kernel/sysfb_simplefb.c
> index 764a29f..35b8641 100644
> --- a/arch/x86/kernel/sysfb_simplefb.c
> +++ b/arch/x86/kernel/sysfb_simplefb.c
> @@ -67,6 +67,20 @@ __init int create_simplefb(const struct screen_info *si,
> struct platform_device *pd;
> struct resource res;
> unsigned long len;
> +   u64 base;
> +
> +   /*
> +* If the 64BIT_BASE capability is set, ext_lfb_base will contain the
> +* upper half of the base address. Assemble the address, then make 
> sure
> +* it is valid and we can actually access it.
> +*/
> +   base = si->lfb_base;
> +   if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
> +   base |= (u64)si->ext_lfb_base << 32;
> +   if (!base || (u64)(resource_size_t)base != base) {
> +   printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
> +   return -EINVAL;
> +   }
>
> /* don't use lfb_size as it may contain the whole VMEM instead of only
>  * the part that is occupied by the framebuffer */
> @@ -81,8 +95,8 @@ __init int create_simplefb(const struct screen_info *si,
> memset(, 0, sizeof(res));
> res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> res.name = simplefb_resname;
> -   res.start = si->lfb_base;
> -   res.end = si->lfb_base + len - 1;
> +   res.start = base;
> +   res.end = res.start + len - 1;
> if (res.end <= res.start)
> return -EINVAL;
>
> --
> 2.10.2
>

Re: [RFC v1 06/14] bus1: util - queue utility library

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:58 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Fri, Oct 28, 2016 at 03:47:58PM +0200, Tom Gundersen wrote:
>> On Fri, Oct 28, 2016 at 3:33 PM, Peter Zijlstra <pet...@infradead.org> wrote:
>> > On Fri, Oct 28, 2016 at 01:33:25PM +0200, Tom Gundersen wrote:
>
>> > And this, precisely, is what generates all the complexity found in this
>> > patch.  You want to strictly provide more than causality, which does
>> > not, as per the argument above, provide this at all.
>> >
>> > You're providing a semi-global ordering of things that are themselves
>> > not actually ordered.
>>
>> We are providing two things: causality (as in your physics example
>> above), and consistency (which, I agree, is cute, but not necessarily
>> crucial). However, the complexity comes from causality. Consistency is
>> trivial. The only thing needed for consistency is to tag each message
>> by its sender and use this to resolve conflicts in the ordering. The
>> alternative would be to just let these entries order arbitrarily
>> instead, but conceptually it would not be simpler and it would only
>> save us a few lines of code.
>
> Earlier you wrote:
>
>> >> To make this work with multicast, we must stage messages first, then
>> >> commit on a second round. That is, we must find some way to iterate
>> >> over all clocks before committing, but at the same time preventing any
>> >> races. The multicast-stability as you just described we get for free
>> >> by introducing the second-level ordering via sender-address.
>
> But you don't need the two-pass thing at all for causality. The entire
> two-pass thing, and the serialization, is part of the consistency thing.
>
> This is not virtually free.
>
> For causality, all you need is a single iteration, delivering the
> message one after the other, only ever doing local clock movements. You
> do not need to find the max clock in the multicast set and avoid races
> etc..

Ah, I see, we are talking past each other. The property we do want
(which is not trivial) is that we do not want to observe the effect
before the cause. If an event at A causes an event at B, then the two
events should be guaranteed to be observed at C in that order. i.e.,
if you send a multi-cast message from A to B and C and as a result of
receiving the message, B sends a message to C, we want to be
guaranteed that C receives the latter after the former.

If this property is not wanted, then (repeated) unicast can in most
cases be used instead of multi-cast (and a natural optimization, which
we left out for now, would be to skip the staging round for unicast
messages).

Cheers,

Tom

Re: [RFC v1 06/14] bus1: util - queue utility library

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:58 PM, Peter Zijlstra  wrote:
> On Fri, Oct 28, 2016 at 03:47:58PM +0200, Tom Gundersen wrote:
>> On Fri, Oct 28, 2016 at 3:33 PM, Peter Zijlstra  wrote:
>> > On Fri, Oct 28, 2016 at 01:33:25PM +0200, Tom Gundersen wrote:
>
>> > And this, precisely, is what generates all the complexity found in this
>> > patch.  You want to strictly provide more than causality, which does
>> > not, as per the argument above, provide this at all.
>> >
>> > You're providing a semi-global ordering of things that are themselves
>> > not actually ordered.
>>
>> We are providing two things: causality (as in your physics example
>> above), and consistency (which, I agree, is cute, but not necessarily
>> crucial). However, the complexity comes from causality. Consistency is
>> trivial. The only thing needed for consistency is to tag each message
>> by its sender and use this to resolve conflicts in the ordering. The
>> alternative would be to just let these entries order arbitrarily
>> instead, but conceptually it would not be simpler and it would only
>> save us a few lines of code.
>
> Earlier you wrote:
>
>> >> To make this work with multicast, we must stage messages first, then
>> >> commit on a second round. That is, we must find some way to iterate
>> >> over all clocks before committing, but at the same time preventing any
>> >> races. The multicast-stability as you just described we get for free
>> >> by introducing the second-level ordering via sender-address.
>
> But you don't need the two-pass thing at all for causality. The entire
> two-pass thing, and the serialization, is part of the consistency thing.
>
> This is not virtually free.
>
> For causality, all you need is a single iteration, delivering the
> message one after the other, only ever doing local clock movements. You
> do not need to find the max clock in the multicast set and avoid races
> etc..

Ah, I see, we are talking past each other. The property we do want
(which is not trivial) is that we do not want to observe the effect
before the cause. If an event at A causes an event at B, then the two
events should be guaranteed to be observed at C in that order. i.e.,
if you send a multi-cast message from A to B and C and as a result of
receiving the message, B sends a message to C, we want to be
guaranteed that C receives the latter after the former.

If this property is not wanted, then (repeated) unicast can in most
cases be used instead of multi-cast (and a natural optimization, which
we left out for now, would be to skip the staging round for unicast
messages).

Cheers,

Tom

Re: [RFC v1 06/14] bus1: util - queue utility library

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:33 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Fri, Oct 28, 2016 at 01:33:25PM +0200, Tom Gundersen wrote:
>> On Thu, Oct 27, 2016 at 6:43 PM, Peter Zijlstra <pet...@infradead.org> wrote:
>> > On Wed, Oct 26, 2016 at 09:18:02PM +0200, David Herrmann wrote:
>> >
>> >> A bus1 message queue is a FIFO, i.e., messages are linearly ordered by
>> >> the time they were sent. Moreover, atomic delivery of messages to
>> >> multiple queues are supported, without any global synchronization, i.e.,
>> >> the order of message delivery is consistent across queues.
>> >>
>> >> Messages can be destined for multiple queues, hence, we need to be
>> >> careful that all queues get a consistent order of incoming messages.
>> >
>> > So I read that to mean that if A and B both send a multi-cast message to
>> > C and D, the messages will appear in the same order for both C and D.
>>
>> That is one of the ordering guarantees, yes.
>>
>> > Why is this important? It seem that this multi-cast ordering generates
>> > much of the complexity of this patch while this Changelog fails to
>> > explain why this is a desired property.
>>
>> I don't think this is the case. The most important guarantee we give
>> is causal ordering.
>
> C and D not observing the message in the same order is consistent with
> causality (and actual physics). The cause is A sending something the
> effect is C receiving something. These two events must be ordered (which
> yields the partial order). But there is no guarantee that different
> observers would observe the same order. Esp. since A and B do not share
> a clock and these events are not in fact ordered themselves.
>
> When we go back to the example of special relativity, as per the paper,
> this is trivially observable if we put A and C together in a frame of
> reference and B and D in a different frame and have the two frames move
> (at a significant fraction of the speed of light) relative to one
> another. The signal, being an emission of light, would not arrive at
> both observers in the same order (if the signal was given sufficiently
> 'simultaneous')
>
>> To make this work with multicast, we must stage messages first, then
>> commit on a second round. That is, we must find some way to iterate
>> over all clocks before committing, but at the same time preventing any
>> races. The multicast-stability as you just described we get for free
>> by introducing the second-level ordering via sender-address.
>
> And this, precisely, is what generates all the complexity found in this
> patch.  You want to strictly provide more than causality, which does
> not, as per the argument above, provide this at all.
>
> You're providing a semi-global ordering of things that are themselves
> not actually ordered.

We are providing two things: causality (as in your physics example
above), and consistency (which, I agree, is cute, but not necessarily
crucial). However, the complexity comes from causality. Consistency is
trivial. The only thing needed for consistency is to tag each message
by its sender and use this to resolve conflicts in the ordering. The
alternative would be to just let these entries order arbitrarily
instead, but conceptually it would not be simpler and it would only
save us a few lines of code.

>> Stability in multicasts without causal order is not necessarily a crucial
>> feature. However, note that if this ordering is given, it allows reducing
>> the number of round-trips in dependent systems. Imagine a daemon
>> reacting to a set of events from different sources. If the actions of that
>> daemon are solely defined by incoming events, someone else can
>> deduce the actions the daemon took without requiring the daemon to
>> send out events by itself. That is, you can just watch the events on the
>> system, and validly deduce the state of such daemon.
>>
>> Example: There is a configuration daemon that sends events when
>> configuration is changed. And there is a hotplug daemon that sends
>> events when devices are hotplugged. You get an event that the "default
>> mute-state" for audio devices was changed, after it you get a
>> hotplugged audio device. You can now rely on the audio daemon to get
>> the events in the same order, and hence apply the new "default
>> mute-state" to the new device. No need to query the audio daemon
>> whether the new device is muted.
>
> Which is cute; but is it worth the pain?
>
>> But as I said, the causal ordering is what we really want.
>> Multicast-stability is just a nice side-effect.
>
> I'm saying they're not the same thing and multi-cast stability isn't at
> all implied.

Yeah, we agree. These are orthogonal concepts. What I meant is that
once we have causality, getting consistency as a side-effect is
virtually free.

Re: [RFC v1 06/14] bus1: util - queue utility library

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:33 PM, Peter Zijlstra  wrote:
> On Fri, Oct 28, 2016 at 01:33:25PM +0200, Tom Gundersen wrote:
>> On Thu, Oct 27, 2016 at 6:43 PM, Peter Zijlstra  wrote:
>> > On Wed, Oct 26, 2016 at 09:18:02PM +0200, David Herrmann wrote:
>> >
>> >> A bus1 message queue is a FIFO, i.e., messages are linearly ordered by
>> >> the time they were sent. Moreover, atomic delivery of messages to
>> >> multiple queues are supported, without any global synchronization, i.e.,
>> >> the order of message delivery is consistent across queues.
>> >>
>> >> Messages can be destined for multiple queues, hence, we need to be
>> >> careful that all queues get a consistent order of incoming messages.
>> >
>> > So I read that to mean that if A and B both send a multi-cast message to
>> > C and D, the messages will appear in the same order for both C and D.
>>
>> That is one of the ordering guarantees, yes.
>>
>> > Why is this important? It seem that this multi-cast ordering generates
>> > much of the complexity of this patch while this Changelog fails to
>> > explain why this is a desired property.
>>
>> I don't think this is the case. The most important guarantee we give
>> is causal ordering.
>
> C and D not observing the message in the same order is consistent with
> causality (and actual physics). The cause is A sending something the
> effect is C receiving something. These two events must be ordered (which
> yields the partial order). But there is no guarantee that different
> observers would observe the same order. Esp. since A and B do not share
> a clock and these events are not in fact ordered themselves.
>
> When we go back to the example of special relativity, as per the paper,
> this is trivially observable if we put A and C together in a frame of
> reference and B and D in a different frame and have the two frames move
> (at a significant fraction of the speed of light) relative to one
> another. The signal, being an emission of light, would not arrive at
> both observers in the same order (if the signal was given sufficiently
> 'simultaneous')
>
>> To make this work with multicast, we must stage messages first, then
>> commit on a second round. That is, we must find some way to iterate
>> over all clocks before committing, but at the same time preventing any
>> races. The multicast-stability as you just described we get for free
>> by introducing the second-level ordering via sender-address.
>
> And this, precisely, is what generates all the complexity found in this
> patch.  You want to strictly provide more than causality, which does
> not, as per the argument above, provide this at all.
>
> You're providing a semi-global ordering of things that are themselves
> not actually ordered.

We are providing two things: causality (as in your physics example
above), and consistency (which, I agree, is cute, but not necessarily
crucial). However, the complexity comes from causality. Consistency is
trivial. The only thing needed for consistency is to tag each message
by its sender and use this to resolve conflicts in the ordering. The
alternative would be to just let these entries order arbitrarily
instead, but conceptually it would not be simpler and it would only
save us a few lines of code.

>> Stability in multicasts without causal order is not necessarily a crucial
>> feature. However, note that if this ordering is given, it allows reducing
>> the number of round-trips in dependent systems. Imagine a daemon
>> reacting to a set of events from different sources. If the actions of that
>> daemon are solely defined by incoming events, someone else can
>> deduce the actions the daemon took without requiring the daemon to
>> send out events by itself. That is, you can just watch the events on the
>> system, and validly deduce the state of such daemon.
>>
>> Example: There is a configuration daemon that sends events when
>> configuration is changed. And there is a hotplug daemon that sends
>> events when devices are hotplugged. You get an event that the "default
>> mute-state" for audio devices was changed, after it you get a
>> hotplugged audio device. You can now rely on the audio daemon to get
>> the events in the same order, and hence apply the new "default
>> mute-state" to the new device. No need to query the audio daemon
>> whether the new device is muted.
>
> Which is cute; but is it worth the pain?
>
>> But as I said, the causal ordering is what we really want.
>> Multicast-stability is just a nice side-effect.
>
> I'm saying they're not the same thing and multi-cast stability isn't at
> all implied.

Yeah, we agree. These are orthogonal concepts. What I meant is that
once we have causality, getting consistency as a side-effect is
virtually free.

Re: [RFC v1 00/14] Bus1 Kernel Message Bus

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:11 PM, Richard Weinberger
 wrote:
> On Wed, Oct 26, 2016 at 9:17 PM, David Herrmann  wrote:
>> Hi
>>
>> This proposal introduces bus1.ko, a kernel messaging bus. This is not a 
>> request
>> for inclusion, yet. It is rather an initial draft and a Request For Comments.
>>
>> While bus1 emerged out of the kdbus project, bus1 was started from scratch 
>> and
>> the concepts have little in common. In a nutshell, bus1 provides a
>> capability-based IPC system, similar in nature to Android Binder, Cap'n 
>> Proto,
>> and seL4. The module is completely generic and does neither require nor 
>> mandate
>> a user-space counter-part.
>
> One thing which is not so clear to me is the role of bus1 wrt. containers.
> Can a container A exchange messages with a container B?
> If not, where is the boundary? I guess it is the pid namespace.

There is no restriction with respect to containers. The metadata is
translated between namespaces, obviously, but you can send messages to
anyone you have a handle to.

Cheers,

Tom

Re: [RFC v1 00/14] Bus1 Kernel Message Bus

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:11 PM, Richard Weinberger
 wrote:
> On Wed, Oct 26, 2016 at 9:17 PM, David Herrmann  wrote:
>> Hi
>>
>> This proposal introduces bus1.ko, a kernel messaging bus. This is not a 
>> request
>> for inclusion, yet. It is rather an initial draft and a Request For Comments.
>>
>> While bus1 emerged out of the kdbus project, bus1 was started from scratch 
>> and
>> the concepts have little in common. In a nutshell, bus1 provides a
>> capability-based IPC system, similar in nature to Android Binder, Cap'n 
>> Proto,
>> and seL4. The module is completely generic and does neither require nor 
>> mandate
>> a user-space counter-part.
>
> One thing which is not so clear to me is the role of bus1 wrt. containers.
> Can a container A exchange messages with a container B?
> If not, where is the boundary? I guess it is the pid namespace.

There is no restriction with respect to containers. The metadata is
translated between namespaces, obviously, but you can send messages to
anyone you have a handle to.

Cheers,

Tom

Re: [RFC v1 08/14] bus1: implement peer management context

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:05 PM, Richard Weinberger
 wrote:
> On Wed, Oct 26, 2016 at 9:18 PM, David Herrmann  wrote:
>> +   /* initialize constant fields */
>> +   peer->id = atomic64_inc_return(_ids);
>> +   peer->flags = 0;
>> +   peer->cred = get_cred(current_cred());
>> +   peer->pid_ns = get_pid_ns(task_active_pid_ns(current));
>> +   peer->user = user;
>> +   peer->debugdir = NULL;
>> +   init_waitqueue_head(>waitq);
>> +   bus1_active_init(>active);
>> +
>> +   /* initialize data section */
>> +   mutex_init(>data.lock);
>> +
>> +   /* initialize peer-private section */
>> +   mutex_init(>local.lock);
>> +
>> +   if (!IS_ERR_OR_NULL(bus1_debugdir)) {
>
> How can bus1_debugdir contain an error code? AFACT it is either a
> valid dentry or NULL.

If debugfs is not enabled it will be ERR_PTR(-ENODEV).

>> +   char idstr[22];
>> +
>> +   snprintf(idstr, sizeof(idstr), "peer-%llx", peer->id);
>> +
>> +   peer->debugdir = debugfs_create_dir(idstr, bus1_debugdir);
>> +   if (!peer->debugdir) {
>> +   pr_err("cannot create debugfs dir for peer %llx\n",
>> +  peer->id);
>> +   } else if (!IS_ERR_OR_NULL(peer->debugdir)) {
>> +   bus1_debugfs_create_atomic_x("active", S_IRUGO,
>> +peer->debugdir,
>> +>active.count);
>> +   }
>> +   }
>> +
>> +   bus1_active_activate(>active);
>
> This is a no-nop since bus1_active_init() set ->count to BUS1_ACTIVE_NEW.

bus1_active_activate() changes count from BUS1_ACTIVE_NEW to 0.

>> +   return peer;
>> +}
>> +
>> +static int bus1_peer_disconnect(struct bus1_peer *peer)
>> +{
>> +   bus1_active_deactivate(>active);
>> +   bus1_active_drain(>active, >waitq);
>> +
>> +   if (!bus1_active_cleanup(>active, >waitq,
>> +NULL, NULL))
>> +   return -ESHUTDOWN;
>> +
>> +   return 0;
>> +}
>> +
>> +/**
>> + * bus1_peer_free() - destroy peer
>> + * @peer:  peer to destroy, or NULL
>> + *
>> + * Destroy a peer object that was previously allocated via bus1_peer_new().
>> + * This synchronously waits for any outstanding operations on this peer to
>> + * finish, then releases all linked resources and deallocates the peer in an
>> + * rcu-delayed manner.
>> + *
>> + * If NULL is passed, this is a no-op.
>> + *
>> + * Return: NULL is returned.
>
> What about making the function of type void?

We are consistently returning the type being freed so we can do

foo->bar = bar_free(bar);

Just a matter of style though.

>> +struct bus1_peer *bus1_peer_free(struct bus1_peer *peer)
>> +{
>> +   if (!peer)
>> +   return NULL;
>> +
>> +   /* disconnect from environment */
>> +   bus1_peer_disconnect(peer);
>> +
>> +   /* deinitialize peer-private section */
>> +   mutex_destroy(>local.lock);
>> +
>> +   /* deinitialize data section */
>> +   mutex_destroy(>data.lock);
>> +
>> +   /* deinitialize constant fields */
>> +   debugfs_remove_recursive(peer->debugdir);
>> +   bus1_active_deinit(>active);
>> +   peer->user = bus1_user_unref(peer->user);
>> +   put_pid_ns(peer->pid_ns);
>> +   put_cred(peer->cred);
>> +   kfree_rcu(peer, rcu);
>> +
>> +   return NULL;
>> +}
>
> --
> Thanks,
> //richard

Re: [RFC v1 08/14] bus1: implement peer management context

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 3:05 PM, Richard Weinberger
 wrote:
> On Wed, Oct 26, 2016 at 9:18 PM, David Herrmann  wrote:
>> +   /* initialize constant fields */
>> +   peer->id = atomic64_inc_return(_ids);
>> +   peer->flags = 0;
>> +   peer->cred = get_cred(current_cred());
>> +   peer->pid_ns = get_pid_ns(task_active_pid_ns(current));
>> +   peer->user = user;
>> +   peer->debugdir = NULL;
>> +   init_waitqueue_head(>waitq);
>> +   bus1_active_init(>active);
>> +
>> +   /* initialize data section */
>> +   mutex_init(>data.lock);
>> +
>> +   /* initialize peer-private section */
>> +   mutex_init(>local.lock);
>> +
>> +   if (!IS_ERR_OR_NULL(bus1_debugdir)) {
>
> How can bus1_debugdir contain an error code? AFACT it is either a
> valid dentry or NULL.

If debugfs is not enabled it will be ERR_PTR(-ENODEV).

>> +   char idstr[22];
>> +
>> +   snprintf(idstr, sizeof(idstr), "peer-%llx", peer->id);
>> +
>> +   peer->debugdir = debugfs_create_dir(idstr, bus1_debugdir);
>> +   if (!peer->debugdir) {
>> +   pr_err("cannot create debugfs dir for peer %llx\n",
>> +  peer->id);
>> +   } else if (!IS_ERR_OR_NULL(peer->debugdir)) {
>> +   bus1_debugfs_create_atomic_x("active", S_IRUGO,
>> +peer->debugdir,
>> +>active.count);
>> +   }
>> +   }
>> +
>> +   bus1_active_activate(>active);
>
> This is a no-nop since bus1_active_init() set ->count to BUS1_ACTIVE_NEW.

bus1_active_activate() changes count from BUS1_ACTIVE_NEW to 0.

>> +   return peer;
>> +}
>> +
>> +static int bus1_peer_disconnect(struct bus1_peer *peer)
>> +{
>> +   bus1_active_deactivate(>active);
>> +   bus1_active_drain(>active, >waitq);
>> +
>> +   if (!bus1_active_cleanup(>active, >waitq,
>> +NULL, NULL))
>> +   return -ESHUTDOWN;
>> +
>> +   return 0;
>> +}
>> +
>> +/**
>> + * bus1_peer_free() - destroy peer
>> + * @peer:  peer to destroy, or NULL
>> + *
>> + * Destroy a peer object that was previously allocated via bus1_peer_new().
>> + * This synchronously waits for any outstanding operations on this peer to
>> + * finish, then releases all linked resources and deallocates the peer in an
>> + * rcu-delayed manner.
>> + *
>> + * If NULL is passed, this is a no-op.
>> + *
>> + * Return: NULL is returned.
>
> What about making the function of type void?

We are consistently returning the type being freed so we can do

foo->bar = bar_free(bar);

Just a matter of style though.

>> +struct bus1_peer *bus1_peer_free(struct bus1_peer *peer)
>> +{
>> +   if (!peer)
>> +   return NULL;
>> +
>> +   /* disconnect from environment */
>> +   bus1_peer_disconnect(peer);
>> +
>> +   /* deinitialize peer-private section */
>> +   mutex_destroy(>local.lock);
>> +
>> +   /* deinitialize data section */
>> +   mutex_destroy(>data.lock);
>> +
>> +   /* deinitialize constant fields */
>> +   debugfs_remove_recursive(peer->debugdir);
>> +   bus1_active_deinit(>active);
>> +   peer->user = bus1_user_unref(peer->user);
>> +   put_pid_ns(peer->pid_ns);
>> +   put_cred(peer->cred);
>> +   kfree_rcu(peer, rcu);
>> +
>> +   return NULL;
>> +}
>
> --
> Thanks,
> //richard

Re: [RFC v1 08/14] bus1: implement peer management context

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 2:06 PM, Richard Weinberger
 wrote:
> David, Tom,
>
> On Wed, Oct 26, 2016 at 9:18 PM, David Herrmann  wrote:
>> +struct bus1_peer *bus1_peer_new(void)
>> +{
>> +   static atomic64_t peer_ids = ATOMIC64_INIT(0);
>> +   const struct cred *cred = current_cred();
>> +   struct bus1_peer *peer;
>> +   struct bus1_user *user;
>> +
>> +   user = bus1_user_ref_by_uid(cred->uid);
>> +   if (IS_ERR(user))
>> +   return ERR_CAST(user);
>> +
>> +   peer = kmalloc(sizeof(*peer), GFP_KERNEL);
>> +   if (!peer) {
>> +   bus1_user_unref(user);
>> +   return ERR_PTR(-ENOMEM);
>> +   }
>> +
>> +   /* initialize constant fields */
>> +   peer->id = atomic64_inc_return(_ids);
>
> What is the purpose of this id? Do other components depend on it
> and are they aware of possible overflows?

The id is used purely to give a name to the peer in debugfs.

> Since it is an 64bit integer overflowing it is hard but not impossible.

Hm, what scenario do you have in mind? I cannot see how this could
happen (short of creating peers in a loop for hundreds of years).

Cheers,

Tom

Re: [RFC v1 08/14] bus1: implement peer management context

2016-10-28 Thread Tom Gundersen

On Fri, Oct 28, 2016 at 2:06 PM, Richard Weinberger
 wrote:
> David, Tom,
>
> On Wed, Oct 26, 2016 at 9:18 PM, David Herrmann  wrote:
>> +struct bus1_peer *bus1_peer_new(void)
>> +{
>> +   static atomic64_t peer_ids = ATOMIC64_INIT(0);
>> +   const struct cred *cred = current_cred();
>> +   struct bus1_peer *peer;
>> +   struct bus1_user *user;
>> +
>> +   user = bus1_user_ref_by_uid(cred->uid);
>> +   if (IS_ERR(user))
>> +   return ERR_CAST(user);
>> +
>> +   peer = kmalloc(sizeof(*peer), GFP_KERNEL);
>> +   if (!peer) {
>> +   bus1_user_unref(user);
>> +   return ERR_PTR(-ENOMEM);
>> +   }
>> +
>> +   /* initialize constant fields */
>> +   peer->id = atomic64_inc_return(_ids);
>
> What is the purpose of this id? Do other components depend on it
> and are they aware of possible overflows?

The id is used purely to give a name to the peer in debugfs.

> Since it is an 64bit integer overflowing it is hard but not impossible.

Hm, what scenario do you have in mind? I cannot see how this could
happen (short of creating peers in a loop for hundreds of years).

Cheers,

Tom

Re: [RFC v1 06/14] bus1: util - queue utility library

2016-10-28 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 6:43 PM, Peter Zijlstra  wrote:
> On Wed, Oct 26, 2016 at 09:18:02PM +0200, David Herrmann wrote:
>
>> A bus1 message queue is a FIFO, i.e., messages are linearly ordered by
>> the time they were sent. Moreover, atomic delivery of messages to
>> multiple queues are supported, without any global synchronization, i.e.,
>> the order of message delivery is consistent across queues.
>>
>> Messages can be destined for multiple queues, hence, we need to be
>> careful that all queues get a consistent order of incoming messages.
>
> So I read that to mean that if A and B both send a multi-cast message to
> C and D, the messages will appear in the same order for both C and D.

That is one of the ordering guarantees, yes.

> Why is this important? It seem that this multi-cast ordering generates
> much of the complexity of this patch while this Changelog fails to
> explain why this is a desired property.

I don't think this is the case. The most important guarantee we give
is causal ordering. To make this work with multicast, we must stage
messages first, then commit on a second round. That is, we must find
some way to iterate over all clocks before committing, but at the same
time preventing any races. The multicast-stability as you just described
we get for free by introducing the second-level ordering via
sender-address.

Stability in multicasts without causal order is not necessarily a crucial
feature. However, note that if this ordering is given, it allows reducing
the number of round-trips in dependent systems. Imagine a daemon
reacting to a set of events from different sources. If the actions of that
daemon are solely defined by incoming events, someone else can
deduce the actions the daemon took without requiring the daemon to
send out events by itself. That is, you can just watch the events on the
system, and validly deduce the state of such daemon.

Example: There is a configuration daemon that sends events when
configuration is changed. And there is a hotplug daemon that sends
events when devices are hotplugged. You get an event that the "default
mute-state" for audio devices was changed, after it you get a
hotplugged audio device. You can now rely on the audio daemon to get
the events in the same order, and hence apply the new "default
mute-state" to the new device. No need to query the audio daemon
whether the new device is muted.

But as I said, the causal ordering is what we really want.
Multicast-stability is just a nice side-effect.

It might also be note mentioning: Both Android Binder and Chromium
Mojo make sure they provide causal ordering, since they run into real
issues. Binder allows placing multiple messages under the same
binder-lock, and Mojo provides Associated Interfaces [1]. DBus makes
sure to provide those ordering guarantees as well.

>> We
>> define the concept of `global order' to provide a basic set of
>> guarantees. This global order is a partial order on the set of all
>> messages. The order is defined as:
>>
>> 1) If a message B was queued *after* a message A, then: A < B
>>
>> 2) If a message B was queued *after* a message A was dequeued,
>>then: A < B
>>
>> 3) If a message B was dequeued *after* a message A on the same queue,
>>then: A < B
>>
>> (Note: Causality is honored. `after' and `before' do not refer to
>>  the same task, nor the same queue, but rather any kind of
>>  synchronization between the two operations.)
>>
>> The queue object implements this global order in a lockless fashion. It
>> solely relies on a distributed clock on each queue. Each message to be
>> sent causes a clock tick on the local clock and on all destination
>> clocks. Furthermore, all clocks are synchronized, meaning they're
>> fast-forwarded in case they're behind the highest of all participating
>> peers. No global state tracking is involved.
>
> Yet the code does compares on more than just timestamps. Why are these
> secondary (and even tertiary) ordering required?

Lamport Timestamps are guaranteed to be unique per-sender, but a receiving
queue can still contain messages with the same timestamp (from different
senders). That is, if two multicasts overlap, they might end up with the same
timestamp, if, and only if, they can have no causal relationship
(i.e., the ioctls
are called concurrently). We want to extend this partial order, though. We
want to provide a stable order in those cases (as described above), so we
need a secondary order (we simply pick the memory address of the sender).
This guarantees that all receivers get the same order of all messages (even
if they have equal timestamps).

Note that equal timestamps only happen if entries block each other.
Hence, we can use the memory address as secondary order, since we know
it is unique in those cases (and cannot be re-used).

Cheers,

Tom

[1] 
https://docs.google.com/document/d/1ENDDzACX4hplfQ8cCHGo_rXd3IHTu5H4hEZ44Cu8KVs

Re: [RFC v1 06/14] bus1: util - queue utility library

2016-10-28 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 6:43 PM, Peter Zijlstra  wrote:
> On Wed, Oct 26, 2016 at 09:18:02PM +0200, David Herrmann wrote:
>
>> A bus1 message queue is a FIFO, i.e., messages are linearly ordered by
>> the time they were sent. Moreover, atomic delivery of messages to
>> multiple queues are supported, without any global synchronization, i.e.,
>> the order of message delivery is consistent across queues.
>>
>> Messages can be destined for multiple queues, hence, we need to be
>> careful that all queues get a consistent order of incoming messages.
>
> So I read that to mean that if A and B both send a multi-cast message to
> C and D, the messages will appear in the same order for both C and D.

That is one of the ordering guarantees, yes.

> Why is this important? It seem that this multi-cast ordering generates
> much of the complexity of this patch while this Changelog fails to
> explain why this is a desired property.

I don't think this is the case. The most important guarantee we give
is causal ordering. To make this work with multicast, we must stage
messages first, then commit on a second round. That is, we must find
some way to iterate over all clocks before committing, but at the same
time preventing any races. The multicast-stability as you just described
we get for free by introducing the second-level ordering via
sender-address.

Stability in multicasts without causal order is not necessarily a crucial
feature. However, note that if this ordering is given, it allows reducing
the number of round-trips in dependent systems. Imagine a daemon
reacting to a set of events from different sources. If the actions of that
daemon are solely defined by incoming events, someone else can
deduce the actions the daemon took without requiring the daemon to
send out events by itself. That is, you can just watch the events on the
system, and validly deduce the state of such daemon.

Example: There is a configuration daemon that sends events when
configuration is changed. And there is a hotplug daemon that sends
events when devices are hotplugged. You get an event that the "default
mute-state" for audio devices was changed, after it you get a
hotplugged audio device. You can now rely on the audio daemon to get
the events in the same order, and hence apply the new "default
mute-state" to the new device. No need to query the audio daemon
whether the new device is muted.

But as I said, the causal ordering is what we really want.
Multicast-stability is just a nice side-effect.

It might also be note mentioning: Both Android Binder and Chromium
Mojo make sure they provide causal ordering, since they run into real
issues. Binder allows placing multiple messages under the same
binder-lock, and Mojo provides Associated Interfaces [1]. DBus makes
sure to provide those ordering guarantees as well.

>> We
>> define the concept of `global order' to provide a basic set of
>> guarantees. This global order is a partial order on the set of all
>> messages. The order is defined as:
>>
>> 1) If a message B was queued *after* a message A, then: A < B
>>
>> 2) If a message B was queued *after* a message A was dequeued,
>>then: A < B
>>
>> 3) If a message B was dequeued *after* a message A on the same queue,
>>then: A < B
>>
>> (Note: Causality is honored. `after' and `before' do not refer to
>>  the same task, nor the same queue, but rather any kind of
>>  synchronization between the two operations.)
>>
>> The queue object implements this global order in a lockless fashion. It
>> solely relies on a distributed clock on each queue. Each message to be
>> sent causes a clock tick on the local clock and on all destination
>> clocks. Furthermore, all clocks are synchronized, meaning they're
>> fast-forwarded in case they're behind the highest of all participating
>> peers. No global state tracking is involved.
>
> Yet the code does compares on more than just timestamps. Why are these
> secondary (and even tertiary) ordering required?

Lamport Timestamps are guaranteed to be unique per-sender, but a receiving
queue can still contain messages with the same timestamp (from different
senders). That is, if two multicasts overlap, they might end up with the same
timestamp, if, and only if, they can have no causal relationship
(i.e., the ioctls
are called concurrently). We want to extend this partial order, though. We
want to provide a stable order in those cases (as described above), so we
need a secondary order (we simply pick the memory address of the sender).
This guarantees that all receivers get the same order of all messages (even
if they have equal timestamps).

Note that equal timestamps only happen if entries block each other.
Hence, we can use the memory address as secondary order, since we know
it is unique in those cases (and cannot be re-used).

Cheers,

Tom

[1] 
https://docs.google.com/document/d/1ENDDzACX4hplfQ8cCHGo_rXd3IHTu5H4hEZ44Cu8KVs

Re: [RFC v1 02/14] bus1: provide stub cdev /dev/bus1

2016-10-27 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 6:37 PM, Linus Torvalds
<torva...@linux-foundation.org> wrote:
> On Thu, Oct 27, 2016 at 8:25 AM, Tom Gundersen <t...@jklm.no> wrote:
>>
>> Could you elaborate on why you think syscalls would be more
>> appropriate than ioctls?
>
> ioctl's tend to be a horrid mess both for things like compat.but also
> for things like system call tracing and filtering (ie BPF).
>
> The compat mess is fixable by making sure you always use 64-bit fields
> rather than pointers everywhere and everything is aligned.

This we do.

> The
> tracing and filtering one not so much.

Got it. Thanks.

Tom

Re: [RFC v1 02/14] bus1: provide stub cdev /dev/bus1

2016-10-27 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 6:37 PM, Linus Torvalds
 wrote:
> On Thu, Oct 27, 2016 at 8:25 AM, Tom Gundersen  wrote:
>>
>> Could you elaborate on why you think syscalls would be more
>> appropriate than ioctls?
>
> ioctl's tend to be a horrid mess both for things like compat.but also
> for things like system call tracing and filtering (ie BPF).
>
> The compat mess is fixable by making sure you always use 64-bit fields
> rather than pointers everywhere and everything is aligned.

This we do.

> The
> tracing and filtering one not so much.

Got it. Thanks.

Tom

Re: [RFC v1 02/14] bus1: provide stub cdev /dev/bus1

2016-10-27 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 11:11 AM, Arnd Bergmann <a...@arndb.de> wrote:
> On Thursday, October 27, 2016 1:54:05 AM CEST Tom Gundersen wrote:
>> On Thu, Oct 27, 2016 at 1:19 AM, Andy Lutomirski <l...@amacapital.net> wrote:
>> > This may have been covered elsewhere, but could this use syscalls instead?
>>
>> Yes, syscalls would work essentially the same. For now, we are using a
>> cdev as it makes it a lot more convenient to develop and test as an
>> out-of-tree module, but that could be changed easily before the final
>> submission, if that's what we want.
>
>
> Generally speaking, I think syscalls would be appropriate here, and put
> bus1 into a similar category as the other ipc interfaces (shm, msg, sem,
> mqueue, ...).

Could you elaborate on why you think syscalls would be more
appropriate than ioctls?

> However, syscall API design is nontrivial, and will require a bit of
> work to come to a set of syscalls that is fairly compact but also
> extensible enough. I think it makes sense to go through the exercise
> of working out what the syscall interface would end up looking like,
> and then make a decision.
>
> There is currently a set of file operations:
>
> @@ -48,7 +90,11 @@ const struct file_operations bus1_fops = {
> .owner  = THIS_MODULE,
> .open   = bus1_fop_open,
> .release= bus1_fop_release,
> +   .poll   = bus1_fop_poll,
> .llseek = noop_llseek,
> +   .mmap   = bus1_fop_mmap,
> +   .unlocked_ioctl = bus1_peer_ioctl,
> +   .compat_ioctl   = bus1_peer_ioctl,
> .show_fdinfo= bus1_fop_show_fdinfo,
>  };
>
> and then another set of ioctls:
>
> +enum {
> +   BUS1_CMD_PEER_DISCONNECT= _IOWR(BUS1_IOCTL_MAGIC, 0x00,
> +   __u64),
> +   BUS1_CMD_PEER_QUERY = _IOWR(BUS1_IOCTL_MAGIC, 0x01,
> +   struct bus1_cmd_peer_reset),
> +   BUS1_CMD_PEER_RESET = _IOWR(BUS1_IOCTL_MAGIC, 0x02,
> +   struct bus1_cmd_peer_reset),
> +   BUS1_CMD_HANDLE_RELEASE = _IOWR(BUS1_IOCTL_MAGIC, 0x10,
> +   __u64),
> +   BUS1_CMD_HANDLE_TRANSFER= _IOWR(BUS1_IOCTL_MAGIC, 0x11,
> +   struct bus1_cmd_handle_transfer),
> +   BUS1_CMD_NODES_DESTROY  = _IOWR(BUS1_IOCTL_MAGIC, 0x20,
> +   struct bus1_cmd_nodes_destroy),
> +   BUS1_CMD_SLICE_RELEASE  = _IOWR(BUS1_IOCTL_MAGIC, 0x30,
> +   __u64),
> +   BUS1_CMD_SEND   = _IOWR(BUS1_IOCTL_MAGIC, 0x40,
> +   struct bus1_cmd_send),
> +   BUS1_CMD_RECV   = _IOWR(BUS1_IOCTL_MAGIC, 0x50,
> +   struct bus1_cmd_recv),
> +};
>
> I think there is no alternative to having some sort of file descriptor
> with the basic operations you have above, but there is a question of
> how to get that file descriptor if the ioctls get changed to a syscall,
> the basic options being:

I could see the point of wanting a syscall to get the fd (your second
option below), but as I said, not sure I see why we would want to use
syscalls instead of ioctls.

> - Keep using a chardev. This works, but feels a little odd to me,
>   and I can't think of any other interfaces combining syscalls with
>   a chardev.
>
> - Have one syscall that returns an open file descriptor, replacing
>   the fops->open() function. One advantage is that you can pass
>   additional arguments in that you can't have with open.
>   An example for this would be mqueue_open().

If we are going to change it, this might makes sense to me. It would
allow you to get the fd without having to have access to some
character device.

> - Have a mountable file system, and use open() on that to create
>   connections. Advantages are that it's fairly easy to have one
>   instance per fs-namespace, and you can have user-defined naming
>   of objects in the file system.

Note that currently we only have one object (/dev/bus1) and each fd is
disconnected from anything else on creation, so not sure what benefits
a filesystem (or several instances of it) would give?

> For the other operations, the obvious translation would be to
> turn each ioctl command into one syscall, but that may not always
> be the best representation. One limitation is that you cannot
> generally have more than six 'long' arguments on a lot of
> architectures, and passing 'u64' arguments to syscalls is awk

Re: [RFC v1 02/14] bus1: provide stub cdev /dev/bus1

2016-10-27 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 11:11 AM, Arnd Bergmann  wrote:
> On Thursday, October 27, 2016 1:54:05 AM CEST Tom Gundersen wrote:
>> On Thu, Oct 27, 2016 at 1:19 AM, Andy Lutomirski  wrote:
>> > This may have been covered elsewhere, but could this use syscalls instead?
>>
>> Yes, syscalls would work essentially the same. For now, we are using a
>> cdev as it makes it a lot more convenient to develop and test as an
>> out-of-tree module, but that could be changed easily before the final
>> submission, if that's what we want.
>
>
> Generally speaking, I think syscalls would be appropriate here, and put
> bus1 into a similar category as the other ipc interfaces (shm, msg, sem,
> mqueue, ...).

Could you elaborate on why you think syscalls would be more
appropriate than ioctls?

> However, syscall API design is nontrivial, and will require a bit of
> work to come to a set of syscalls that is fairly compact but also
> extensible enough. I think it makes sense to go through the exercise
> of working out what the syscall interface would end up looking like,
> and then make a decision.
>
> There is currently a set of file operations:
>
> @@ -48,7 +90,11 @@ const struct file_operations bus1_fops = {
> .owner  = THIS_MODULE,
> .open   = bus1_fop_open,
> .release= bus1_fop_release,
> +   .poll   = bus1_fop_poll,
> .llseek = noop_llseek,
> +   .mmap   = bus1_fop_mmap,
> +   .unlocked_ioctl = bus1_peer_ioctl,
> +   .compat_ioctl   = bus1_peer_ioctl,
> .show_fdinfo= bus1_fop_show_fdinfo,
>  };
>
> and then another set of ioctls:
>
> +enum {
> +   BUS1_CMD_PEER_DISCONNECT= _IOWR(BUS1_IOCTL_MAGIC, 0x00,
> +   __u64),
> +   BUS1_CMD_PEER_QUERY = _IOWR(BUS1_IOCTL_MAGIC, 0x01,
> +   struct bus1_cmd_peer_reset),
> +   BUS1_CMD_PEER_RESET = _IOWR(BUS1_IOCTL_MAGIC, 0x02,
> +   struct bus1_cmd_peer_reset),
> +   BUS1_CMD_HANDLE_RELEASE = _IOWR(BUS1_IOCTL_MAGIC, 0x10,
> +   __u64),
> +   BUS1_CMD_HANDLE_TRANSFER= _IOWR(BUS1_IOCTL_MAGIC, 0x11,
> +   struct bus1_cmd_handle_transfer),
> +   BUS1_CMD_NODES_DESTROY  = _IOWR(BUS1_IOCTL_MAGIC, 0x20,
> +   struct bus1_cmd_nodes_destroy),
> +   BUS1_CMD_SLICE_RELEASE  = _IOWR(BUS1_IOCTL_MAGIC, 0x30,
> +   __u64),
> +   BUS1_CMD_SEND   = _IOWR(BUS1_IOCTL_MAGIC, 0x40,
> +   struct bus1_cmd_send),
> +   BUS1_CMD_RECV   = _IOWR(BUS1_IOCTL_MAGIC, 0x50,
> +   struct bus1_cmd_recv),
> +};
>
> I think there is no alternative to having some sort of file descriptor
> with the basic operations you have above, but there is a question of
> how to get that file descriptor if the ioctls get changed to a syscall,
> the basic options being:

I could see the point of wanting a syscall to get the fd (your second
option below), but as I said, not sure I see why we would want to use
syscalls instead of ioctls.

> - Keep using a chardev. This works, but feels a little odd to me,
>   and I can't think of any other interfaces combining syscalls with
>   a chardev.
>
> - Have one syscall that returns an open file descriptor, replacing
>   the fops->open() function. One advantage is that you can pass
>   additional arguments in that you can't have with open.
>   An example for this would be mqueue_open().

If we are going to change it, this might makes sense to me. It would
allow you to get the fd without having to have access to some
character device.

> - Have a mountable file system, and use open() on that to create
>   connections. Advantages are that it's fairly easy to have one
>   instance per fs-namespace, and you can have user-defined naming
>   of objects in the file system.

Note that currently we only have one object (/dev/bus1) and each fd is
disconnected from anything else on creation, so not sure what benefits
a filesystem (or several instances of it) would give?

> For the other operations, the obvious translation would be to
> turn each ioctl command into one syscall, but that may not always
> be the best representation. One limitation is that you cannot
> generally have more than six 'long' arguments on a lot of
> architectures, and passing 'u64' arguments to syscalls is awkward.
>
> For some of the commands, the t

Re: [RFC v1 02/14] bus1: provide stub cdev /dev/bus1

2016-10-26 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 1:19 AM, Andy Lutomirski  wrote:
> This may have been covered elsewhere, but could this use syscalls instead?

Yes, syscalls would work essentially the same. For now, we are using a
cdev as it makes it a lot more convenient to develop and test as an
out-of-tree module, but that could be changed easily before the final
submission, if that's what we want.

Cheers,

Tom

Re: [RFC v1 02/14] bus1: provide stub cdev /dev/bus1

2016-10-26 Thread Tom Gundersen

On Thu, Oct 27, 2016 at 1:19 AM, Andy Lutomirski  wrote:
> This may have been covered elsewhere, but could this use syscalls instead?

Yes, syscalls would work essentially the same. For now, we are using a
cdev as it makes it a lot more convenient to develop and test as an
out-of-tree module, but that could be changed easily before the final
submission, if that's what we want.

Cheers,

Tom

Re: [PATCH 2/8] driver-core: add asynchronous probing support for drivers

2015-07-07 Thread Tom Gundersen

On Tue, Jul 7, 2015 at 1:23 AM, Luis R. Rodriguez  wrote:
> On Sat, Jul 04, 2015 at 07:09:19AM -0700, Dan Williams wrote:
>> On Fri, Jul 3, 2015 at 11:30 AM, Luis R. Rodriguez  wrote:
>> > On Sat, Jun 27, 2015 at 04:45:25PM -0700, Dan Williams wrote:
>> >> On Mon, Mar 30, 2015 at 4:20 PM, Dmitry Torokhov
>> >>  wrote:
>> >> > Some devices take a long time when initializing, and not all drivers are
>> >> > suited to initialize their devices when they are open. For example,
>> >> > input drivers need to interrogate their devices in order to publish
>> >> > device's capabilities before userspace will open them. When such drivers
>> >> > are compiled into kernel they may stall entire kernel initialization.
>> >> >
>> >> > This change allows drivers request for their probe functions to be
>> >> > called asynchronously during driver and device registration (manual
>> >> > binding is still synchronous). Because async_schedule is used to perform
>> >> > asynchronous calls module loading will still wait for the probing to
>> >> > complete.
>> >> >
>> >> > Note that the end goal is to make the probing asynchronous by default,
>> >> > so annotating drivers with PROBE_PREFER_ASYNCHRONOUS is a temporary
>> >> > measure that allows us to speed up boot process while we validating and
>> >> > fixing the rest of the drivers and preparing userspace.
>> >> >
>> >> > This change is based on earlier patch by "Luis R. Rodriguez"
>> >> > 
>> >> >
>> >> > Signed-off-by: Dmitry Torokhov 
>> >> > ---
>> >> >  drivers/base/base.h|   1 +
>> >> >  drivers/base/bus.c |  31 +++---
>> >> >  drivers/base/dd.c  | 149 
>> >> > ++---
>> >> >  include/linux/device.h |  28 ++
>> >> >  4 files changed, 182 insertions(+), 27 deletions(-)
>> >>
>> >> Just noticed this patch.  It caught my eye because I had a hard time
>> >> getting an open coded implementation of asynchronous probing to work
>> >> in the new libnvdimm subsystem.  Especially the messy races of tearing
>> >> things down while probing is still in flight.  I ended up implementing
>> >> asynchronous device registration which eliminated a lot of complexity
>> >> and of course the bugs.  In general I tend to think that async
>> >> registration is less risky than async probe since it keeps wider
>> >> portions of the traditional device model synchronous
>> >
>> > but its not see -DEFER_PROBE even before async probe.
>>
>> Except in that case you know probe has been seen by the driver at
>> least once.  So I see that as less of a surprise, but point taken.
>>
>> >> and leverages the
>> >> fact that the device model is already well prepared for asynchronous
>> >> arrival of devices due to hotplug.
>> >
>> > I think this sounds reasonable, do you have your code upstream or posted?
>>
>> Yes, see nd_device_register() in drivers/nvdimm/bus.c
>
> It should be I think rather easy for Dmitry to see if he can convert this 
> input
> driver (not yet upstream) to this API and see if the same issues are fixed.
> This however does not address systemd's assumption over detachment of module
> load and probe. The inherent problem there was the timeout implemented and
> carried in systemd over the worker that uses modlib to load modules. Upon
> review the code was complex enough already and surely increasing the timeout
> helps but that doesn't address all issues with a general timeout in place.
> At SUSE we ended up not using a timeout for kmod built-in commands. That
> leaves the original timeout purpose in place. The code for async probe was
> not put in the kernel though but since its now upstream we should be able
> to replace that userspace systemd work around with async probe, but systemd
> folks would need to decide what they want to do. For full gory details of
> this refer to:
>
> https://bugzilla.novell.com/show_bug.cgi?id=889297

FTR, this does not appear to be public, so I was not able to see it.

>> > If not will you be at Plumbers?
>>
>> Yes.
>
> Great, Tom, Dmitry, will you be at Plumbers?

Sadly, I won't make plumbers this year.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/8] driver-core: add asynchronous probing support for drivers

2015-07-07 Thread Tom Gundersen

On Tue, Jul 7, 2015 at 1:23 AM, Luis R. Rodriguez mcg...@suse.com wrote:
 On Sat, Jul 04, 2015 at 07:09:19AM -0700, Dan Williams wrote:
 On Fri, Jul 3, 2015 at 11:30 AM, Luis R. Rodriguez mcg...@suse.com wrote:
  On Sat, Jun 27, 2015 at 04:45:25PM -0700, Dan Williams wrote:
  On Mon, Mar 30, 2015 at 4:20 PM, Dmitry Torokhov
  dmitry.torok...@gmail.com wrote:
   Some devices take a long time when initializing, and not all drivers are
   suited to initialize their devices when they are open. For example,
   input drivers need to interrogate their devices in order to publish
   device's capabilities before userspace will open them. When such drivers
   are compiled into kernel they may stall entire kernel initialization.
  
   This change allows drivers request for their probe functions to be
   called asynchronously during driver and device registration (manual
   binding is still synchronous). Because async_schedule is used to perform
   asynchronous calls module loading will still wait for the probing to
   complete.
  
   Note that the end goal is to make the probing asynchronous by default,
   so annotating drivers with PROBE_PREFER_ASYNCHRONOUS is a temporary
   measure that allows us to speed up boot process while we validating and
   fixing the rest of the drivers and preparing userspace.
  
   This change is based on earlier patch by Luis R. Rodriguez
   mcg...@suse.com
  
   Signed-off-by: Dmitry Torokhov dmitry.torok...@gmail.com
   ---
drivers/base/base.h|   1 +
drivers/base/bus.c |  31 +++---
drivers/base/dd.c  | 149 
   ++---
include/linux/device.h |  28 ++
4 files changed, 182 insertions(+), 27 deletions(-)
 
  Just noticed this patch.  It caught my eye because I had a hard time
  getting an open coded implementation of asynchronous probing to work
  in the new libnvdimm subsystem.  Especially the messy races of tearing
  things down while probing is still in flight.  I ended up implementing
  asynchronous device registration which eliminated a lot of complexity
  and of course the bugs.  In general I tend to think that async
  registration is less risky than async probe since it keeps wider
  portions of the traditional device model synchronous
 
  but its not see -DEFER_PROBE even before async probe.

 Except in that case you know probe has been seen by the driver at
 least once.  So I see that as less of a surprise, but point taken.

  and leverages the
  fact that the device model is already well prepared for asynchronous
  arrival of devices due to hotplug.
 
  I think this sounds reasonable, do you have your code upstream or posted?

 Yes, see nd_device_register() in drivers/nvdimm/bus.c

 It should be I think rather easy for Dmitry to see if he can convert this 
 input
 driver (not yet upstream) to this API and see if the same issues are fixed.
 This however does not address systemd's assumption over detachment of module
 load and probe. The inherent problem there was the timeout implemented and
 carried in systemd over the worker that uses modlib to load modules. Upon
 review the code was complex enough already and surely increasing the timeout
 helps but that doesn't address all issues with a general timeout in place.
 At SUSE we ended up not using a timeout for kmod built-in commands. That
 leaves the original timeout purpose in place. The code for async probe was
 not put in the kernel though but since its now upstream we should be able
 to replace that userspace systemd work around with async probe, but systemd
 folks would need to decide what they want to do. For full gory details of
 this refer to:

 https://bugzilla.novell.com/show_bug.cgi?id=889297

FTR, this does not appear to be public, so I was not able to see it.

  If not will you be at Plumbers?

 Yes.

 Great, Tom, Dmitry, will you be at Plumbers?

Sadly, I won't make plumbers this year.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [!GIT PULL] kdbus for 4.2

2015-06-26 Thread Tom Gundersen

On Fri, Jun 26, 2015 at 9:33 PM, Andy Lutomirski  wrote:
> What's a good distro on which to poke at a full running system?

Fedora Rawhide is probably currently your best bet.

> Fedora Rawhide seems to still build systemd with --disable-kdbus,
> although I suppose that means that I could boot it with kdbus=1?

Correct.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH][RFC] usbhid: enable autosuspend for internal devices

2015-06-26 Thread Tom Gundersen

This policy used to be unconditionally applied by udev, but there
is no reason to make userspace be involved in this and in the future
udev will not be doing it by default.

See: <https://github.com/systemd/systemd/pull/353>.

Signed-off-by: Tom Gundersen 
Cc: Jiri Kosina 
Cc: Greg Kroah-Hartman 
---

Hi,

I don't have the right hardware for this, so it has only been compile-tested.
I'm therefore sending it as an RFC only. Mainly I want to bring it to people's
attention that it would be great to get this feature into the kernel as we want
to drop it from udev.

Cheers,

Tom

 drivers/hid/usbhid/hid-core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index bfbe1be..af80700 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -1358,6 +1358,9 @@ static int usbhid_probe(struct usb_interface *intf, const 
struct usb_device_id *
setup_timer(>io_retry, hid_retry_timeout, (unsigned long) hid);
spin_lock_init(>lock);

+   if (dev->removable == USB_DEVICE_FIXED)
+   usb_enable_autosuspend(dev);
+
ret = hid_add_device(hid);
if (ret) {
if (ret != -ENODEV)
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [!GIT PULL] kdbus for 4.2

2015-06-26 Thread Tom Gundersen

On Fri, Jun 26, 2015 at 9:33 PM, Andy Lutomirski l...@amacapital.net wrote:
 What's a good distro on which to poke at a full running system?

Fedora Rawhide is probably currently your best bet.

 Fedora Rawhide seems to still build systemd with --disable-kdbus,
 although I suppose that means that I could boot it with kdbus=1?

Correct.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH][RFC] usbhid: enable autosuspend for internal devices

2015-06-26 Thread Tom Gundersen

This policy used to be unconditionally applied by udev, but there
is no reason to make userspace be involved in this and in the future
udev will not be doing it by default.

See: https://github.com/systemd/systemd/pull/353.

Signed-off-by: Tom Gundersen t...@jklm.no
Cc: Jiri Kosina jkos...@suse.cz
Cc: Greg Kroah-Hartman gre...@linuxfoundation.org
---

Hi,

I don't have the right hardware for this, so it has only been compile-tested.
I'm therefore sending it as an RFC only. Mainly I want to bring it to people's
attention that it would be great to get this feature into the kernel as we want
to drop it from udev.

Cheers,

Tom

 drivers/hid/usbhid/hid-core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index bfbe1be..af80700 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -1358,6 +1358,9 @@ static int usbhid_probe(struct usb_interface *intf, const 
struct usb_device_id *
setup_timer(usbhid-io_retry, hid_retry_timeout, (unsigned long) hid);
spin_lock_init(usbhid-lock);

+   if (dev-removable == USB_DEVICE_FIXED)
+   usb_enable_autosuspend(dev);
+
ret = hid_add_device(hid);
if (ret) {
if (ret != -ENODEV)
-- 
2.4.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [GIT PULL] kdbus for 4.1-rc1

2015-04-16 Thread Tom Gundersen

On 04/15/2015 10:22 PM, Andy Lutomirski wrote:
> On Wed, Apr 15, 2015 at 9:44 AM, Havoc Pennington  wrote:
>> That is, with dbus if I send a broadcast message, then send a unicast
>> request to another client, then drop the connection causing the bus to
>> broadcast that I've dropped; then the other client will see those
>> things in that order - the broadcast, then the request, and then that
>> I've dropped the connection.
>
> This leads me to a potentially interesting question: where's the
> buffering?  If there's a bus with lots of untrusted clients and one of
> them broadcasts data faster than all receivers can process it, where
> does it go?

The concepts implemented in kdbus are actually quite different from dbus1:

Every connection to the bus has a memory pool assigned to store
incoming messages and variably sized runtime data returned by kdbus.
The pool memory is swappable, backed by a shmem file which is
associated with the bus connection.

Also, broadcasts are opt-in, so you only receive them if you
subscribed for the specific signal. It is either sent by another
userspace task, or by the kernel itself for things like name owner
changes. In order to receive those, a connection must install a match.
By default, no-one will receive any broadcasts.

All types of messages (unicast and broadcast) are directly stored into
a pool slice of the receiving connection, and this slice is not reused
by the kernel until userspace is finished with it and frees it. Hence,
a client which doesn't process its incoming messages will, at some
point, run out of pool space. If that happens for unicast messages,
the sender will get an EXFULL error. If it happens for a multicast
message, all we can do is drop the message, and tell the receiver how
many messages have been lost when it issues KDBUS_CMD_RECV the next
time. There's more on that in kdbus.message(7).

Also note that there is a quota logic in kdbus which protects against
a single connection conducting a DOS against another one. Together
with the policy code, this logic prevents one peer from flooding the
pool of another peer. Communication with a 3rd party is not affected
by this, due to the fair allocation scheme of the pool logic.

All this is explained in detail in kdbus.pool(7), but please let us
know if anything there is unclear.

> At least with a userspace solution, it's clear what the OOM killer
> should kill when this happens.  Unless it's PID 1.  Sigh.

No, if the buffering was done in the sender, the OOM killer would
catch the sending peer, which is of course the wrong thing to do,
because one connection could blow up a task simply by not responding
to the messages it sends. This is the reason why the pool concept was
a design principle in kdbus from the very beginning.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [GIT PULL] kdbus for 4.1-rc1

2015-04-16 Thread Tom Gundersen

On Wed, Apr 15, 2015 at 2:09 PM, Jiri Kosina  wrote:
> On Wed, 15 Apr 2015, Greg Kroah-Hartman wrote:
>
>> 'systemctl reboot' calls a bunch of other things to determine if you
>> have local access to the machine, or permissions to reboot the machine
>> (i.e. CAP_SYS_BOOT), and other things that polkit might allow you to do,
>> and then, it decides to reboot or not.  That happens today, right?  I
>> don't understand the argument here.
>
> And what exactly is the argument that this is the way it should be
> implemnted?
>
> Why can't it just rely on the kernel to provide final answer to "to reboot
> or not to reboot, that is the question"?
>
> At the end of the day, it's the kernel that decides whether it will really
> ultimately ask the platform to reboot.
>
> If, for whatever reason (which might be completely invisible to userspace)
> kernel decides not to do so, userspace has to be able to recover from such
> failure in any case.

This is not how shutting down a general purpose operating system
works. If a system is shut down, all user sessions are terminated, all
services are stopped in the right order, all remaining processes
killed, all file systems are unmounted, all storage devices
disassembled, and so on. All this is implemented entirely in userspace
and involves a number of complex transitions from the normal init
system, to a shutdown PID 1 process and finally a transition back to
the initial ramdisk so that we can unmount the root file system even.
After all that is done, in the right order, following dependencies,
while enforcing timeouts, then the very last step is actually the
reboot() system call that then brings the kernel to a halt, and
possibly turns off power.

Thus I don't see how your suggestion can be applied in any way to how
system shutdown works: the shutdown procedure includes these
non-trivial preparation steps described above, and it is essential
that this preparation is not begun unless the client requesting it
actually has sufficient rights to do so. Or to put this another way:
if the system went all the way down, so that everything is killed,
unmounted, disassembled, to the point even that we transitioned away
from the root file system, then the reboot() system call is really
just the tiniest bit of it. And you should not be able to get there if
you originally didn't even possess the capability to execute that last
step...

Moreover, the daemon performing the shutdown tasks is necessarily
always privileged enough to do so, so calling into the kernel and see
what happens is completely the wrong thing to do (it would simply
succeed). What matters is if the client calling the daemon is
sufficiently privileged. If the client has the capabilites necessary
to call the reboot syscall directly, it makes no sense to disallow
them from doing a clean reboot. It would be like giving someone access
to pull the power plug, but not allow them to shutdown the machine
cleanly.

To conclude, the kernel makes the decision for allowing reboot() to
succeed based on CAP_SYS_BOOT, so when we decide whether or not to
perform the preparation steps, we really must also use CAP_SYS_BOOT.
If we are more restrictive, it does not gain us anything as people
with CAP_SYS_BOOT can just circumvent our logic and "pull the plug" by
calling reboot() directly. If we are less restrictive and for instance
check for uid==0 it would essentially mean that we have added a way to
circumvent the dropping of CAP_SYS_BOOT.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [GIT PULL] kdbus for 4.1-rc1

2015-04-16 Thread Tom Gundersen

On Wed, Apr 15, 2015 at 2:09 PM, Jiri Kosina jkos...@suse.cz wrote:
 On Wed, 15 Apr 2015, Greg Kroah-Hartman wrote:

 'systemctl reboot' calls a bunch of other things to determine if you
 have local access to the machine, or permissions to reboot the machine
 (i.e. CAP_SYS_BOOT), and other things that polkit might allow you to do,
 and then, it decides to reboot or not.  That happens today, right?  I
 don't understand the argument here.

 And what exactly is the argument that this is the way it should be
 implemnted?

 Why can't it just rely on the kernel to provide final answer to to reboot
 or not to reboot, that is the question?

 At the end of the day, it's the kernel that decides whether it will really
 ultimately ask the platform to reboot.

 If, for whatever reason (which might be completely invisible to userspace)
 kernel decides not to do so, userspace has to be able to recover from such
 failure in any case.

This is not how shutting down a general purpose operating system
works. If a system is shut down, all user sessions are terminated, all
services are stopped in the right order, all remaining processes
killed, all file systems are unmounted, all storage devices
disassembled, and so on. All this is implemented entirely in userspace
and involves a number of complex transitions from the normal init
system, to a shutdown PID 1 process and finally a transition back to
the initial ramdisk so that we can unmount the root file system even.
After all that is done, in the right order, following dependencies,
while enforcing timeouts, then the very last step is actually the
reboot() system call that then brings the kernel to a halt, and
possibly turns off power.

Thus I don't see how your suggestion can be applied in any way to how
system shutdown works: the shutdown procedure includes these
non-trivial preparation steps described above, and it is essential
that this preparation is not begun unless the client requesting it
actually has sufficient rights to do so. Or to put this another way:
if the system went all the way down, so that everything is killed,
unmounted, disassembled, to the point even that we transitioned away
from the root file system, then the reboot() system call is really
just the tiniest bit of it. And you should not be able to get there if
you originally didn't even possess the capability to execute that last
step...

Moreover, the daemon performing the shutdown tasks is necessarily
always privileged enough to do so, so calling into the kernel and see
what happens is completely the wrong thing to do (it would simply
succeed). What matters is if the client calling the daemon is
sufficiently privileged. If the client has the capabilites necessary
to call the reboot syscall directly, it makes no sense to disallow
them from doing a clean reboot. It would be like giving someone access
to pull the power plug, but not allow them to shutdown the machine
cleanly.

To conclude, the kernel makes the decision for allowing reboot() to
succeed based on CAP_SYS_BOOT, so when we decide whether or not to
perform the preparation steps, we really must also use CAP_SYS_BOOT.
If we are more restrictive, it does not gain us anything as people
with CAP_SYS_BOOT can just circumvent our logic and pull the plug by
calling reboot() directly. If we are less restrictive and for instance
check for uid==0 it would essentially mean that we have added a way to
circumvent the dropping of CAP_SYS_BOOT.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [GIT PULL] kdbus for 4.1-rc1

2015-04-16 Thread Tom Gundersen

On 04/15/2015 10:22 PM, Andy Lutomirski wrote:
 On Wed, Apr 15, 2015 at 9:44 AM, Havoc Pennington h...@pobox.com wrote:
 That is, with dbus if I send a broadcast message, then send a unicast
 request to another client, then drop the connection causing the bus to
 broadcast that I've dropped; then the other client will see those
 things in that order - the broadcast, then the request, and then that
 I've dropped the connection.

 This leads me to a potentially interesting question: where's the
 buffering?  If there's a bus with lots of untrusted clients and one of
 them broadcasts data faster than all receivers can process it, where
 does it go?

The concepts implemented in kdbus are actually quite different from dbus1:

Every connection to the bus has a memory pool assigned to store
incoming messages and variably sized runtime data returned by kdbus.
The pool memory is swappable, backed by a shmem file which is
associated with the bus connection.

Also, broadcasts are opt-in, so you only receive them if you
subscribed for the specific signal. It is either sent by another
userspace task, or by the kernel itself for things like name owner
changes. In order to receive those, a connection must install a match.
By default, no-one will receive any broadcasts.

All types of messages (unicast and broadcast) are directly stored into
a pool slice of the receiving connection, and this slice is not reused
by the kernel until userspace is finished with it and frees it. Hence,
a client which doesn't process its incoming messages will, at some
point, run out of pool space. If that happens for unicast messages,
the sender will get an EXFULL error. If it happens for a multicast
message, all we can do is drop the message, and tell the receiver how
many messages have been lost when it issues KDBUS_CMD_RECV the next
time. There's more on that in kdbus.message(7).

Also note that there is a quota logic in kdbus which protects against
a single connection conducting a DOS against another one. Together
with the policy code, this logic prevents one peer from flooding the
pool of another peer. Communication with a 3rd party is not affected
by this, due to the fair allocation scheme of the pool logic.

All this is explained in detail in kdbus.pool(7), but please let us
know if anything there is unclear.

 At least with a userspace solution, it's clear what the OOM killer
 should kill when this happens.  Unless it's PID 1.  Sigh.

No, if the buffering was done in the sender, the OOM killer would
catch the sending peer, which is of course the wrong thing to do,
because one connection could blow up a task simply by not responding
to the messages it sends. This is the reason why the pool concept was
a design principle in kdbus from the very beginning.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 00/14] Add kdbus implementation

2015-03-31 Thread Tom Gundersen

On Tue, Mar 31, 2015 at 3:58 PM, Andy Lutomirski  wrote:
> On Mon, Mar 30, 2015 at 9:56 AM, David Herrmann  wrote:
>> Hi
>>
>> On Wed, Mar 25, 2015 at 7:12 PM, Andy Lutomirski  wrote:
>>> On Wed, Mar 25, 2015 at 10:29 AM, David Herrmann  
>>> wrote:
>> [...]
> I could be wrong about the lack of use cases.  If so, please enlighten me.

 We have several dbus APIs that allow clients to register as a special
 handler/controller/etc. (eg., see systemd-logind TakeControl()). The
 API provider checks the privileges of a client on registration and
 then just tracks the client ID. This way, the client can be privileged
 when asking for special access, then drop privileges and still use the
 interface. You cannot re-connect in between, as the API provider
 tracks your bus ID. Without message-metadata, all your (other) calls
 on this bus would always be treated as privileged. We *really* want to
 avoid this.
>>>
>>> Connect twice?
>>
>> The remote peer might cache your connection ID to track you. You have
>> to use the same connection to talk to that peer, in this given
>> scenario. That's how D-Bus1 is used today, and we have to follow the
>> semantics.
>
> That's yet another reason that you really ought to disconnect and
> reconnect after a privilege change -- the remote peer might remember
> you.
>
> The "might" will be a big problem.  Users of kdbus can't rely on any
> particular concept of privilege because you have too many of them.
>
>>
>>> You *already* have to reconnect or connect twice because you have
>>> per-connection metadata.  That's part of my problem with this scheme
>>> -- you support *both styles*, which seems like it'll give you most of
>>> the downsides of both without the upsides.
>>
>> Not necessarily. Connection metadata describes the state at the time
>> you connected to the bus. If someone ask for this information, they
>> will get exactly that. In this model, you cannot drop privileges, if
>> you need to be privileged during setup.
>> If someone asks for per-message metadata, they better ought not ask
>> for per-connection creds. It's not the information they're looking
>> for, so it will not match the data that at the time the message was
>> sent.
>>
>> There is no immediate need to make both match. For security decisions,
>> we mandate per-message creds. Per-connection creds are for
>> backwards-compatibility to dbus1 and for passive introspection of bus
>> connections.
>
> Backwards compatibility doesn't magically exempt security
> considerations.

No one is arguing that.

> If new code is insecure when talking to a legacy
> service, it's still insecure.
>
> [...]
>
>>> Again, you seem to be arguing that per-connection metadata is bad, but
>>> you still have an implementation of per-connection metadata, so you
>>> still have all these problems.
>>
>> I don't see why we get the problems of per-connection metadata. Just
>> because you _can_ use it, doesn't mean you should use it for all
>> imaginable use-cases. The same goes for reading information from
>> /proc. There are valid use-cases to do so, but also a lot of cases
>> where it will not provide the information you want.
>>
>
> Then you'll need to document really carefully which metadata is used
> for which service.  This actually seems impossible to do, since some
> services will exist in legacy and kdbus forms.
>
>>> I'm actually okay with per-message metadata in principle, but I'd like
>>> to see evidence (with numbers, please) that a send+recv of per-message
>>> metadata is *not* significantly slower than a recv of already-captured
>>> per-connection metadata.  If this is in fact the case, then maybe you
>>> should trash per-connection metadata instead and the legacy
>>> compatibility code can figure out a way to deal with it.  IMO that
>>> would be a pretty nice outcome, since you would never have to worry
>>> whether your connection to the bus is inadvertantly privileged.
>>
>> Per-message metadata makes SEND about 25% slower, if you transmit the
>> full set of all possible information. Just 3% if you only use
>> PIDs+UIDs. The expensive metadata is cgroup-path and exe-path.
>> If a service needs that information, however, and if that information
>> is not guaranteed to be up-to-date, the service _will_ go and look it
>> up in /proc or somewhere else, which is certainly a whole lot more
>> expensive than the code in kdbus.
>
> Can you give actual numbers, in ns or cycles, of how much overhead
> metadata adds?
>
>>
>> In general, there seems to be a number of misconception in this thread
>> about what kdbus is supposed to be. We're not inventing something new
>> here with a clean slate, but we're moving parts of an existing
>> implementation that has tons of users into the kernel, in order to fix
>> issues that cannot be fixed otherwise in userspace (most notably, the
>> race gaps that exist when retrieving per-message metadata). Therefore,
>> we have to keep existing semantics stable,

Re: [PATCH v4 00/14] Add kdbus implementation

2015-03-31 Thread Tom Gundersen

On Tue, Mar 31, 2015 at 3:58 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Mon, Mar 30, 2015 at 9:56 AM, David Herrmann dh.herrm...@gmail.com wrote:
 Hi

 On Wed, Mar 25, 2015 at 7:12 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Wed, Mar 25, 2015 at 10:29 AM, David Herrmann dh.herrm...@gmail.com 
 wrote:
 [...]
 I could be wrong about the lack of use cases.  If so, please enlighten me.

 We have several dbus APIs that allow clients to register as a special
 handler/controller/etc. (eg., see systemd-logind TakeControl()). The
 API provider checks the privileges of a client on registration and
 then just tracks the client ID. This way, the client can be privileged
 when asking for special access, then drop privileges and still use the
 interface. You cannot re-connect in between, as the API provider
 tracks your bus ID. Without message-metadata, all your (other) calls
 on this bus would always be treated as privileged. We *really* want to
 avoid this.

 Connect twice?

 The remote peer might cache your connection ID to track you. You have
 to use the same connection to talk to that peer, in this given
 scenario. That's how D-Bus1 is used today, and we have to follow the
 semantics.

 That's yet another reason that you really ought to disconnect and
 reconnect after a privilege change -- the remote peer might remember
 you.

 The might will be a big problem.  Users of kdbus can't rely on any
 particular concept of privilege because you have too many of them.


 You *already* have to reconnect or connect twice because you have
 per-connection metadata.  That's part of my problem with this scheme
 -- you support *both styles*, which seems like it'll give you most of
 the downsides of both without the upsides.

 Not necessarily. Connection metadata describes the state at the time
 you connected to the bus. If someone ask for this information, they
 will get exactly that. In this model, you cannot drop privileges, if
 you need to be privileged during setup.
 If someone asks for per-message metadata, they better ought not ask
 for per-connection creds. It's not the information they're looking
 for, so it will not match the data that at the time the message was
 sent.

 There is no immediate need to make both match. For security decisions,
 we mandate per-message creds. Per-connection creds are for
 backwards-compatibility to dbus1 and for passive introspection of bus
 connections.

 Backwards compatibility doesn't magically exempt security
 considerations.

No one is arguing that.

 If new code is insecure when talking to a legacy
 service, it's still insecure.

 [...]

 Again, you seem to be arguing that per-connection metadata is bad, but
 you still have an implementation of per-connection metadata, so you
 still have all these problems.

 I don't see why we get the problems of per-connection metadata. Just
 because you _can_ use it, doesn't mean you should use it for all
 imaginable use-cases. The same goes for reading information from
 /proc. There are valid use-cases to do so, but also a lot of cases
 where it will not provide the information you want.


 Then you'll need to document really carefully which metadata is used
 for which service.  This actually seems impossible to do, since some
 services will exist in legacy and kdbus forms.

 I'm actually okay with per-message metadata in principle, but I'd like
 to see evidence (with numbers, please) that a send+recv of per-message
 metadata is *not* significantly slower than a recv of already-captured
 per-connection metadata.  If this is in fact the case, then maybe you
 should trash per-connection metadata instead and the legacy
 compatibility code can figure out a way to deal with it.  IMO that
 would be a pretty nice outcome, since you would never have to worry
 whether your connection to the bus is inadvertantly privileged.

 Per-message metadata makes SEND about 25% slower, if you transmit the
 full set of all possible information. Just 3% if you only use
 PIDs+UIDs. The expensive metadata is cgroup-path and exe-path.
 If a service needs that information, however, and if that information
 is not guaranteed to be up-to-date, the service _will_ go and look it
 up in /proc or somewhere else, which is certainly a whole lot more
 expensive than the code in kdbus.

 Can you give actual numbers, in ns or cycles, of how much overhead
 metadata adds?


 In general, there seems to be a number of misconception in this thread
 about what kdbus is supposed to be. We're not inventing something new
 here with a clean slate, but we're moving parts of an existing
 implementation that has tons of users into the kernel, in order to fix
 issues that cannot be fixed otherwise in userspace (most notably, the
 race gaps that exist when retrieving per-message metadata). Therefore,
 we have to keep existing semantics stable, otherwise the exercise is
 somewhat pointless.

 IOW you're taking something that you dislike aspects of and shoving
 most of it in the kernel.

[PATCH] net: nl80211 - pass name_assign_type to rdev_add_virtual_intf()

2015-03-18 Thread Tom Gundersen

This will expose in /sys whether the ifname of a device is set by userspace
or generated by the kernel. The latter kind (wlanX, etc) is not deterministic,
so userspace needs to rename these devices to names that are guaranteed to
stay the same between reboots. The former, however should never be renamed,
so userspace needs to be able to reliably tell the difference.

Similar functionality was introduced for the rtnetlink core in commit 5517750.

Signed-off-by: Tom Gundersen 
Cc: Kalle Valo 
Cc: Brett Rudley 
Cc: Arend van Spriel 
Cc: Franky (Zhenhui) Lin 
Cc: Hante Meuleman 
Cc: Johannes Berg 
---

Hi Johannes,

This patch was originally part of a bigger series from last year, but I decided
to split it out and resend it alone. You had a comment back then about using
enums instead of unsigned char in the method signatures. Both sound fine to me,
but I opted for keeping unsigned char for the sake of uniformity with the 
similar
code in the core.

If this looks ok to you, could you take it through the wireless tree, or do you
want me to resend it to Dave directly?

Cheers,

Tom

 drivers/net/wireless/ath/ath6kl/cfg80211.c | 6 --
 drivers/net/wireless/ath/ath6kl/cfg80211.h | 1 +
 drivers/net/wireless/ath/ath6kl/core.c | 4 ++--
 drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c | 3 ++-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c  | 3 +++
 drivers/net/wireless/brcm80211/brcmfmac/p2p.h  | 1 +
 drivers/net/wireless/mwifiex/cfg80211.c| 5 +++--
 drivers/net/wireless/mwifiex/main.c| 6 +++---
 drivers/net/wireless/mwifiex/main.h| 1 +
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c  | 6 +-
 include/net/cfg80211.h | 1 +
 net/mac80211/cfg.c | 3 ++-
 net/mac80211/ieee80211_i.h | 1 +
 net/mac80211/iface.c   | 3 ++-
 net/mac80211/main.c| 2 +-
 net/wireless/nl80211.c | 3 ++-
 net/wireless/rdev-ops.h| 5 +++--
 17 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c 
b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 85da63a..0cb5433 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1495,6 +1495,7 @@ static int ath6kl_cfg80211_set_power_mgmt(struct wiphy 
*wiphy,
 
 static struct wireless_dev *ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
  const char *name,
+ unsigned char 
name_assign_type,
  enum nl80211_iftype type,
  u32 *flags,
  struct vif_params *params)
@@ -1513,7 +1514,7 @@ static struct wireless_dev 
*ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
return ERR_PTR(-EINVAL);
}
 
-   wdev = ath6kl_interface_add(ar, name, type, if_idx, nw_type);
+   wdev = ath6kl_interface_add(ar, name, name_assign_type, type, if_idx, 
nw_type);
if (!wdev)
return ERR_PTR(-ENOMEM);
 
@@ -3633,13 +3634,14 @@ void ath6kl_cfg80211_vif_cleanup(struct ath6kl_vif *vif)
 }
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type)
 {
struct net_device *ndev;
struct ath6kl_vif *vif;
 
-   ndev = alloc_netdev(sizeof(*vif), name, NET_NAME_UNKNOWN, ether_setup);
+   ndev = alloc_netdev(sizeof(*vif), name, name_assign_type, ether_setup);
if (!ndev)
return NULL;
 
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.h 
b/drivers/net/wireless/ath/ath6kl/cfg80211.h
index b59becd..5aa57a7 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.h
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.h
@@ -25,6 +25,7 @@ enum ath6kl_cfg_suspend_mode {
 };
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type);
 void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
diff --git a/drivers/net/wireless/ath/ath6kl/core.c 
b/drivers/net/wireless/ath/ath6kl/core.c
index 0df74b2..4ec02ce 100644
--- a/drivers/net/wireless/ath/ath6kl/core.c
+++ b/drivers/net/wireless/ath/ath6kl/core.c
@@ -211,8 +211,8 @@ int ath6kl_core_init(struct ath6kl *ar, enum 
ath6kl_htc_type htc_type)
rtnl_lock();
 
/* Add an initial station

[PATCH] net: nl80211 - pass name_assign_type to rdev_add_virtual_intf()

2015-03-18 Thread Tom Gundersen

This will expose in /sys whether the ifname of a device is set by userspace
or generated by the kernel. The latter kind (wlanX, etc) is not deterministic,
so userspace needs to rename these devices to names that are guaranteed to
stay the same between reboots. The former, however should never be renamed,
so userspace needs to be able to reliably tell the difference.

Similar functionality was introduced for the rtnetlink core in commit 5517750.

Signed-off-by: Tom Gundersen t...@jklm.no
Cc: Kalle Valo kv...@qca.qualcomm.com
Cc: Brett Rudley brud...@broadcom.com
Cc: Arend van Spriel ar...@broadcom.com
Cc: Franky (Zhenhui) Lin fran...@broadcom.com
Cc: Hante Meuleman meule...@broadcom.com
Cc: Johannes Berg johan...@sipsolutions.net
---

Hi Johannes,

This patch was originally part of a bigger series from last year, but I decided
to split it out and resend it alone. You had a comment back then about using
enums instead of unsigned char in the method signatures. Both sound fine to me,
but I opted for keeping unsigned char for the sake of uniformity with the 
similar
code in the core.

If this looks ok to you, could you take it through the wireless tree, or do you
want me to resend it to Dave directly?

Cheers,

Tom

 drivers/net/wireless/ath/ath6kl/cfg80211.c | 6 --
 drivers/net/wireless/ath/ath6kl/cfg80211.h | 1 +
 drivers/net/wireless/ath/ath6kl/core.c | 4 ++--
 drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c | 3 ++-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c  | 3 +++
 drivers/net/wireless/brcm80211/brcmfmac/p2p.h  | 1 +
 drivers/net/wireless/mwifiex/cfg80211.c| 5 +++--
 drivers/net/wireless/mwifiex/main.c| 6 +++---
 drivers/net/wireless/mwifiex/main.h| 1 +
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c  | 6 +-
 include/net/cfg80211.h | 1 +
 net/mac80211/cfg.c | 3 ++-
 net/mac80211/ieee80211_i.h | 1 +
 net/mac80211/iface.c   | 3 ++-
 net/mac80211/main.c| 2 +-
 net/wireless/nl80211.c | 3 ++-
 net/wireless/rdev-ops.h| 5 +++--
 17 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c 
b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 85da63a..0cb5433 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1495,6 +1495,7 @@ static int ath6kl_cfg80211_set_power_mgmt(struct wiphy 
*wiphy,
 
 static struct wireless_dev *ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
  const char *name,
+ unsigned char 
name_assign_type,
  enum nl80211_iftype type,
  u32 *flags,
  struct vif_params *params)
@@ -1513,7 +1514,7 @@ static struct wireless_dev 
*ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
return ERR_PTR(-EINVAL);
}
 
-   wdev = ath6kl_interface_add(ar, name, type, if_idx, nw_type);
+   wdev = ath6kl_interface_add(ar, name, name_assign_type, type, if_idx, 
nw_type);
if (!wdev)
return ERR_PTR(-ENOMEM);
 
@@ -3633,13 +3634,14 @@ void ath6kl_cfg80211_vif_cleanup(struct ath6kl_vif *vif)
 }
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type)
 {
struct net_device *ndev;
struct ath6kl_vif *vif;
 
-   ndev = alloc_netdev(sizeof(*vif), name, NET_NAME_UNKNOWN, ether_setup);
+   ndev = alloc_netdev(sizeof(*vif), name, name_assign_type, ether_setup);
if (!ndev)
return NULL;
 
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.h 
b/drivers/net/wireless/ath/ath6kl/cfg80211.h
index b59becd..5aa57a7 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.h
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.h
@@ -25,6 +25,7 @@ enum ath6kl_cfg_suspend_mode {
 };
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type);
 void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
diff --git a/drivers/net/wireless/ath/ath6kl/core.c 
b/drivers/net/wireless/ath/ath6kl/core.c
index 0df74b2..4ec02ce 100644
--- a/drivers/net/wireless/ath/ath6kl/core.c
+++ b/drivers/net/wireless/ath/ath6kl/core.c
@@ -211,8 +211,8

Re: [PATCH 01/13] kdbus: add documentation

2015-01-26 Thread Tom Gundersen

Hi Michael,

On Mon, Jan 26, 2015 at 3:42 PM, Michael Kerrisk (man-pages)
 wrote:
> 2. Is the API to be invoked directly by applications or is intended to
>be used only behind specific libraries? You seem to be saying that
>the latter is the case (here, I'm referring to your comment above
>about sd-bus). However, when I asked David Herrmann a similar
>question I got this responser:
>
>   "kdbus is in no way bound to systemd. There are ongoing efforts
>to port glib and qt to kdbus natively. The API is pretty simple
>and I don't see how a libkdbus would simplify things. In fact,
>even our tests only have slim wrappers around the ioctls to
>simplify error-handling in test-scenarios."
>
>To me, that implies that users will employ the raw kernel API.

The way I read this is that there will (probably) be a handful of
users, namely the existing dbus libraries: libdus, sd-bus, glib, Qt,
ell, and maybe a few others. However, third-party developers will not
know/care about the details of kdbus, they'll just be coding against
the dbus libraries as before (might be minor changes, but they
certainly won't need to know anything about the kernel API). Similarly
to how userspace developers now code against their libc of choice,
rather than use kernel syscalls directly.

HTH,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 01/13] kdbus: add documentation

2015-01-26 Thread Tom Gundersen

Hi Michael,

On Mon, Jan 26, 2015 at 3:42 PM, Michael Kerrisk (man-pages)
mtk.manpa...@gmail.com wrote:
 2. Is the API to be invoked directly by applications or is intended to
be used only behind specific libraries? You seem to be saying that
the latter is the case (here, I'm referring to your comment above
about sd-bus). However, when I asked David Herrmann a similar
question I got this responser:

   kdbus is in no way bound to systemd. There are ongoing efforts
to port glib and qt to kdbus natively. The API is pretty simple
and I don't see how a libkdbus would simplify things. In fact,
even our tests only have slim wrappers around the ioctls to
simplify error-handling in test-scenarios.

To me, that implies that users will employ the raw kernel API.

The way I read this is that there will (probably) be a handful of
users, namely the existing dbus libraries: libdus, sd-bus, glib, Qt,
ell, and maybe a few others. However, third-party developers will not
know/care about the details of kdbus, they'll just be coding against
the dbus libraries as before (might be minor changes, but they
certainly won't need to know anything about the kernel API). Similarly
to how userspace developers now code against their libc of choice,
rather than use kernel syscalls directly.

HTH,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: udev/hwdb support in pciutils (Re: [PATCH RESEND][pciutils] libpci: pci_id_lookup - add udev/hwdb support)

2014-11-03 Thread Tom Gundersen

Hi Martin,

On Sat, Nov 1, 2014 at 6:51 PM, Martin Mares  wrote:
> The support for hwdb has finally landed in mainline pciutils.
>
> I have modified your patch to bring it closer to the rest of device
> naming logic. Namely:

Great! Thanks for merging this, and for working on it.

> Aside of the README, everything should be ready for a public release.
> Could you please verify that it still works for you?

I can verify that it still works as expected, so it is good to go as
far as I'm concerned.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: udev/hwdb support in pciutils (Re: [PATCH RESEND][pciutils] libpci: pci_id_lookup - add udev/hwdb support)

2014-11-03 Thread Tom Gundersen

Hi Martin,

On Sat, Nov 1, 2014 at 6:51 PM, Martin Mares m...@ucw.cz wrote:
 The support for hwdb has finally landed in mainline pciutils.

 I have modified your patch to bring it closer to the rest of device
 naming logic. Namely:

Great! Thanks for merging this, and for working on it.

 Aside of the README, everything should be ready for a public release.
 Could you please verify that it still works for you?

I can verify that it still works as expected, so it is good to go as
far as I'm concerned.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 00/12] Add kdbus implementation

2014-10-30 Thread Tom Gundersen

On 10/30/2014 12:55 AM, Andy Lutomirski wrote:> It's worth noting that:
>
>  - Proper credential passing could be added to UNIX sockets, and we
> may want to do that anyway.  Also, the current kdbus semantics seem to
> be "spew lots of credentials and other miscellaneous
> potentially-sensitive and sometime spoofable information all over the
> place", which isn't obviously an improvement.  (This is fixable, but
> it will almost certainly not be compatible with current systemd kdbus
> code if fixed.)

Care to elaborate on what you think is spoofable, and what needs to be fixed?

Anyway, the idea is that by simply connecting to the bus and sending a
message to some service, you implicitly agree to passing some metadata
along to the service (and to a lesser extent to the bus). It's not
that this information is leaked, or that the peer could actively
access any of the sender's private memory. Also note that this kind of
metadata information is also available via /proc/$PID, and via
SCM_CREDENTIALS/SO_PEERCRED and the socket seclabel APIs. What the
kdbus API allows users to do is to get a lot more of this information
in a race-free way. For example, if you want to get the audit identity
bits, you can now get this attached securely by the kernel, at the
time the message is sent, rather than having to firest get the peer's
$PID from SCM_CREDENTIALS and then read the audit identity bits racily
from /proc/$PID/loginuid and /proc/$PID/sessionid.

>  - The current kdbus patches seem to be worse than UNIX sockets from a
> namespace perspective, but maybe I'm misunderstanding how it's
> supposed to work.  UNIX sockets work quite nicely in containers.

kdbus is recusively stackable for containers. You can run
kdbus-enabled containers within kdbus-enabled containers within
kdbus-enabled containers, with the full functionality available for
each container, and each container isolated from each other.

When credential information is passed between processes of different
(PID) namespaces most of the attached metadata is suppressed. This
isn't too different from how SCM_CREDENTIALS works, which will zero
out the bits it cannot translate as well.

>  - There's an obvious interface to add timestamping to UNIX sockets
> (it could work exactly the way it does for UDP / PTP).

Timestamping on AF_UNIX/SOCK_DGRAM already exists, but that's not
enough for the use-cases we want to support.

>  - I'm unconvinced by this performance argument without numbers.  The
> kdbus credential code, at least, looks to be quite heavy on allocation
> and atomics.  This isn't to say that the current userspace D-Bus
> daemon doesn't also serialize everything, but it could be made
> multithreaded.

There are some major benefits regarding performance:

* fewer userspace context switches. For a full-duplex method call it's
down from five to two: instead of sender -> dbus daemon -> service ->
dbus daemon -> sender it's just sender -> service -> sender.
* fewer message copies in userspace. For a full-duplex method call
it's down from eight to two: instead of copying the method call data
into a socket, out of a socket, into a socket, out of a socket, and
the same for the method reply, we just copy one message directly to
the receiver, and the reply back.
* generally fewer syscalls involved. A synchronous method call is now
doable in a single ioctl on the sender side.
* memfds can be used for transport purposes of larger payload. This
way, we can cover substantial payload sizes instead of just small
control messages, with no extra copies. kdbus, in its transport layer,
makes sure only sealed memfds are passed in as payload, so the sender
cannot modify the contents while the receiver is already parsing it.

>  - Race-free?  What are the races that are inherent to UNIX sockets?

Does the above explain what we have in mind?

Note that the aim is not necessarily that kdbus should be better than
UNIX sockets in every way, nor that it should be favoured in all
cases. What we are trying to address is a common case in environments
where peers don't necessarily trust each other.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: kdbus: add header file

2014-10-30 Thread Tom Gundersen

On Thu, Oct 30, 2014 at 9:20 AM, Arnd Bergmann  wrote:
> I think in general, using enum is great, but for ioctl command numbers,
> we probably want to have defines so the user space implementation can
> use #ifdef to see if the kernel version that it is being built for
> knows a particular command.

Does that make sense for the first version? I agree that we should use
 #define to allow #ifdef for when we add more ioctls in the future,
but these ioctls will always exist...

The nice thing about enums is of course that it helps with debugging
as gdb can show the string representation rather than the number,
because in contrast to #defines, an enum is something the compliler
knows about.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 00/12] Add kdbus implementation

2014-10-30 Thread Tom Gundersen

Hi Eric,

On Thu, Oct 30, 2014 at 5:20 AM, Eric W. Biederman
 wrote:
> The userspace API breaks userspace in an unfixable way.
>
> Nacked-by: "Eric W. Biederman" 
>
> Problem the first.
> - Using global names for containers makes it impossible to create
>   unprivileged containers.

I don't follow.

Just so we are on the same page:
  - creating a domain per container is only a convention, and has to
be done manually. I.e., the worst case scenario is that you are able
to create some container which cannot get a corresponding kdbus
domain.
  - domain names are only unique per parent-domain, and domains are
fully recursive. We explicitly tested recursive domains by running
kdbus-enabled containers within kdbus-enabled containers, a number of
iterations deep.

Could you explain the problem you see in more detail? This might just
be a documenation issue, after all.

>   This is a back to the drawing board problem, and makes device
>   nodes fundamentally unsuited to what you are doing.
>
>   There is no way that I can see to make it safe for an unprivileged
>   user to create arbitrary named busses.  Especially in the presence
>   of allowing unprivileged checkpoint/restart.

Note that unprivileged users cannot create arbitrary named busses, the
names must have the format $PID-. Do you see a problem
with this?

>   This is particularly bad as kdbus explicitly allows unprivielged
>   creation of new kdbus instances.

What do you mean by kdbus instance? A new domain? This is not allowed
by unprivileged processes. Or do you mean a new bus, in which case see
above.

>   This problem is a userspace regression.

This is all new functionality, how does it affect current code?

> Problem the second.
> - The security checks in the code are not based on who opens the
>   file descriptors but instead based on who is used the file
>   descriptors at any give moment.
>
>   That pattern has been shown to be exploitable.
>
>   I expect the policy database makes this poor choice of permission
>   checks even worse.  Pass a more privileged user a kdbus file
>   descriptor and all of sudden things that were not possible on
>   that file descriptor become possible.

Djalal already commented on this point in another thread. But just to
recap: Please note that we do not do read()/write() at all, only
ioctl's, so the most common exploits do not apply. Moreover, we are
following the same API pattern as used by other similar APIs in the
kernel. With that in mind, could you give some more specific
information about what kind of exploits you imagine?

> Problem the third.
> - You are using device numbers for things created by unprivileged
>   users.  That breaks checkpoint/restart.  Aka CRIU.
>
>   We can not migrate a container to a new machine and preserve the
>   device numbers.

I must admit to not being too familiar with checkpoint/restart. What
precisely is the problem with unprivileged users?

>   We can not migrate a container to a new machine and have any hope
>   of preserving the container patsh under /dev/kdbus/...

You may not be able to preserve the full path, no, but the container
should not know/care about the parent paths anyway.  Note that the
containers only see their own domain subtree mounted to /dev/kdbus,
they see nothing from the parent. Hence when you migrate containers
you can change the naming of the parent freely, but the processes
inside the containers won't see that, they'll have stable paths.   I'm
not seeing the problem here, care to elaborate?

> I think a kdbusfs modeled on devpts with newinstance at
> mount time would solve the naming problems.

Effectively, what we have in place in the current patch set delivers
similar semantics, however without introducing a new file system. You
just create a new domain and get a new subdir in /dev/kdbus/ for it,
and then inside the container you mount that subdir of /dev/kdbus onto
/dev/kdbus itself.

Do I understand you correctly that what you want is unnamed/anonymous
domains? Considering that domain creation is anyway privileged, why is
this necessary?

> That would break one of the current kdbus use cases that allows an
> unprivileged user to create a bus.

That is a fundamental usecase, so I don't think it makes much sense to
do anything that precludes that.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 00/12] Add kdbus implementation

2014-10-30 Thread Tom Gundersen

Hi Eric,

On Thu, Oct 30, 2014 at 5:20 AM, Eric W. Biederman
ebied...@xmission.com wrote:
 The userspace API breaks userspace in an unfixable way.

 Nacked-by: Eric W. Biederman ebied...@xmission.com

 Problem the first.
 - Using global names for containers makes it impossible to create
   unprivileged containers.

I don't follow.

Just so we are on the same page:
  - creating a domain per container is only a convention, and has to
be done manually. I.e., the worst case scenario is that you are able
to create some container which cannot get a corresponding kdbus
domain.
  - domain names are only unique per parent-domain, and domains are
fully recursive. We explicitly tested recursive domains by running
kdbus-enabled containers within kdbus-enabled containers, a number of
iterations deep.

Could you explain the problem you see in more detail? This might just
be a documenation issue, after all.

   This is a back to the drawing board problem, and makes device
   nodes fundamentally unsuited to what you are doing.

   There is no way that I can see to make it safe for an unprivileged
   user to create arbitrary named busses.  Especially in the presence
   of allowing unprivileged checkpoint/restart.

Note that unprivileged users cannot create arbitrary named busses, the
names must have the format $PID-arbitrary name. Do you see a problem
with this?

   This is particularly bad as kdbus explicitly allows unprivielged
   creation of new kdbus instances.

What do you mean by kdbus instance? A new domain? This is not allowed
by unprivileged processes. Or do you mean a new bus, in which case see
above.

   This problem is a userspace regression.

This is all new functionality, how does it affect current code?

 Problem the second.
 - The security checks in the code are not based on who opens the
   file descriptors but instead based on who is used the file
   descriptors at any give moment.

   That pattern has been shown to be exploitable.

   I expect the policy database makes this poor choice of permission
   checks even worse.  Pass a more privileged user a kdbus file
   descriptor and all of sudden things that were not possible on
   that file descriptor become possible.

Djalal already commented on this point in another thread. But just to
recap: Please note that we do not do read()/write() at all, only
ioctl's, so the most common exploits do not apply. Moreover, we are
following the same API pattern as used by other similar APIs in the
kernel. With that in mind, could you give some more specific
information about what kind of exploits you imagine?

 Problem the third.
 - You are using device numbers for things created by unprivileged
   users.  That breaks checkpoint/restart.  Aka CRIU.

   We can not migrate a container to a new machine and preserve the
   device numbers.

I must admit to not being too familiar with checkpoint/restart. What
precisely is the problem with unprivileged users?

   We can not migrate a container to a new machine and have any hope
   of preserving the container patsh under /dev/kdbus/...

You may not be able to preserve the full path, no, but the container
should not know/care about the parent paths anyway.  Note that the
containers only see their own domain subtree mounted to /dev/kdbus,
they see nothing from the parent. Hence when you migrate containers
you can change the naming of the parent freely, but the processes
inside the containers won't see that, they'll have stable paths.   I'm
not seeing the problem here, care to elaborate?

 I think a kdbusfs modeled on devpts with newinstance at
 mount time would solve the naming problems.

Effectively, what we have in place in the current patch set delivers
similar semantics, however without introducing a new file system. You
just create a new domain and get a new subdir in /dev/kdbus/ for it,
and then inside the container you mount that subdir of /dev/kdbus onto
/dev/kdbus itself.

Do I understand you correctly that what you want is unnamed/anonymous
domains? Considering that domain creation is anyway privileged, why is
this necessary?

 That would break one of the current kdbus use cases that allows an
 unprivileged user to create a bus.

That is a fundamental usecase, so I don't think it makes much sense to
do anything that precludes that.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: kdbus: add header file

2014-10-30 Thread Tom Gundersen

On Thu, Oct 30, 2014 at 9:20 AM, Arnd Bergmann a...@arndb.de wrote:
 I think in general, using enum is great, but for ioctl command numbers,
 we probably want to have defines so the user space implementation can
 use #ifdef to see if the kernel version that it is being built for
 knows a particular command.

Does that make sense for the first version? I agree that we should use
 #define to allow #ifdef for when we add more ioctls in the future,
but these ioctls will always exist...

The nice thing about enums is of course that it helps with debugging
as gdb can show the string representation rather than the number,
because in contrast to #defines, an enum is something the compliler
knows about.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 00/12] Add kdbus implementation

2014-10-30 Thread Tom Gundersen

On 10/30/2014 12:55 AM, Andy Lutomirski wrote: It's worth noting that:

  - Proper credential passing could be added to UNIX sockets, and we
 may want to do that anyway.  Also, the current kdbus semantics seem to
 be spew lots of credentials and other miscellaneous
 potentially-sensitive and sometime spoofable information all over the
 place, which isn't obviously an improvement.  (This is fixable, but
 it will almost certainly not be compatible with current systemd kdbus
 code if fixed.)

Care to elaborate on what you think is spoofable, and what needs to be fixed?

Anyway, the idea is that by simply connecting to the bus and sending a
message to some service, you implicitly agree to passing some metadata
along to the service (and to a lesser extent to the bus). It's not
that this information is leaked, or that the peer could actively
access any of the sender's private memory. Also note that this kind of
metadata information is also available via /proc/$PID, and via
SCM_CREDENTIALS/SO_PEERCRED and the socket seclabel APIs. What the
kdbus API allows users to do is to get a lot more of this information
in a race-free way. For example, if you want to get the audit identity
bits, you can now get this attached securely by the kernel, at the
time the message is sent, rather than having to firest get the peer's
$PID from SCM_CREDENTIALS and then read the audit identity bits racily
from /proc/$PID/loginuid and /proc/$PID/sessionid.

  - The current kdbus patches seem to be worse than UNIX sockets from a
 namespace perspective, but maybe I'm misunderstanding how it's
 supposed to work.  UNIX sockets work quite nicely in containers.

kdbus is recusively stackable for containers. You can run
kdbus-enabled containers within kdbus-enabled containers within
kdbus-enabled containers, with the full functionality available for
each container, and each container isolated from each other.

When credential information is passed between processes of different
(PID) namespaces most of the attached metadata is suppressed. This
isn't too different from how SCM_CREDENTIALS works, which will zero
out the bits it cannot translate as well.

  - There's an obvious interface to add timestamping to UNIX sockets
 (it could work exactly the way it does for UDP / PTP).

Timestamping on AF_UNIX/SOCK_DGRAM already exists, but that's not
enough for the use-cases we want to support.

  - I'm unconvinced by this performance argument without numbers.  The
 kdbus credential code, at least, looks to be quite heavy on allocation
 and atomics.  This isn't to say that the current userspace D-Bus
 daemon doesn't also serialize everything, but it could be made
 multithreaded.

There are some major benefits regarding performance:

* fewer userspace context switches. For a full-duplex method call it's
down from five to two: instead of sender - dbus daemon - service -
dbus daemon - sender it's just sender - service - sender.
* fewer message copies in userspace. For a full-duplex method call
it's down from eight to two: instead of copying the method call data
into a socket, out of a socket, into a socket, out of a socket, and
the same for the method reply, we just copy one message directly to
the receiver, and the reply back.
* generally fewer syscalls involved. A synchronous method call is now
doable in a single ioctl on the sender side.
* memfds can be used for transport purposes of larger payload. This
way, we can cover substantial payload sizes instead of just small
control messages, with no extra copies. kdbus, in its transport layer,
makes sure only sealed memfds are passed in as payload, so the sender
cannot modify the contents while the receiver is already parsing it.

  - Race-free?  What are the races that are inherent to UNIX sockets?

Does the above explain what we have in mind?

Note that the aim is not necessarily that kdbus should be better than
UNIX sockets in every way, nor that it should be favoured in all
cases. What we are trying to address is a common case in environments
where peers don't necessarily trust each other.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-10-10 Thread Tom Gundersen

On Fri, Oct 10, 2014 at 11:54 PM, Anatol Pomozov
 wrote:
> 1) Why not to make the timeout configurable through config file? There
> is already udev.conf you can put config option there. Thus people with
> modprobe issues can easily "fix" the problem. And then decrease
> default timeout back to 30 seconds. I agree that long module loading
> (more than 30 secs) is abnormal and should be investigated by driver
> authors.

We can already configure this either on the udev or kernel
commandline, is that not sufficient (I don't object to also adding it
to the config file, just asking)?

> 2) Could you add 'echo w > /proc/sysrq-trigger' to udev code right
> before killing the "modprobe" thread? sysrq will print information
> about stuck threads (including modprobe itself) this will make
> debugging easier. e.g. dmesg here
> https://bugs.archlinux.org/task/40454 says nothing where the threads
> were stuck.

Are the current warnings (in udev git) sufficient (should tell you
which module is taking long, but still won't tell you which kernel
thread of course)?

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-10-10 Thread Tom Gundersen

On Fri, Oct 10, 2014 at 11:54 PM, Anatol Pomozov
anatol.pomo...@gmail.com wrote:
 1) Why not to make the timeout configurable through config file? There
 is already udev.conf you can put config option there. Thus people with
 modprobe issues can easily fix the problem. And then decrease
 default timeout back to 30 seconds. I agree that long module loading
 (more than 30 secs) is abnormal and should be investigated by driver
 authors.

We can already configure this either on the udev or kernel
commandline, is that not sufficient (I don't object to also adding it
to the config file, just asking)?

 2) Could you add 'echo w  /proc/sysrq-trigger' to udev code right
 before killing the modprobe thread? sysrq will print information
 about stuck threads (including modprobe itself) this will make
 debugging easier. e.g. dmesg here
 https://bugs.archlinux.org/task/40454 says nothing where the threads
 were stuck.

Are the current warnings (in udev git) sufficient (should tell you
which module is taking long, but still won't tell you which kernel
thread of course)?

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-10-03 Thread Tom Gundersen

On Thu, Oct 2, 2014 at 10:06 PM, Luis R. Rodriguez  wrote:
> On Thu, Oct 02, 2014 at 08:12:37AM +0200, Tom Gundersen wrote:
>> Making kmod a special case is of course possible. However, as long as
>> there is no fundamental reason why kmod should get this special
>> treatment, this just looks like a work-around to me.
>
> I've mentioned a series of five reasons why its a bad idea right now to
> sigkill modules [0], we're reviewed them each and still at least
> items 2-4 remain particularly valid fundamental reasons to avoid it

So items 2-4 basically say "there currently are drivers that cannot
deal with sigkill after a three minute timeout".

In the short-term we already have the solution: increase the timeout.
In the long-term, we have two choices, either permanently add some
heuristic to udev to deal with drivers taking a very long time to be
inserted, or fix the drivers not to take such a long time. A priori,
it makes no sense to me that drivers spend unbounded amounts of time
to get inserted, so fixing the drivers seems like the most reasonable
approach to me. That said, I'm of course open to be proven wrong if
there are some drivers that fundamentally _must_ take a long time to
insert (but we should then discuss why that is and how we can best
deal with the situation, rather than adding some hack up-front when we
don't even know if it is needed).

Your patch series should go a long way towards fixing the drivers (and
I imagine there being a lot of low-hanging fruit that can easily be
fixed once your series has landed), and the fact that we have now
increased the udev timeout from 30 to 180 seconds should also greatly
reduce the problem.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-10-03 Thread Tom Gundersen

On Thu, Oct 2, 2014 at 10:06 PM, Luis R. Rodriguez mcg...@suse.com wrote:
 On Thu, Oct 02, 2014 at 08:12:37AM +0200, Tom Gundersen wrote:
 Making kmod a special case is of course possible. However, as long as
 there is no fundamental reason why kmod should get this special
 treatment, this just looks like a work-around to me.

 I've mentioned a series of five reasons why its a bad idea right now to
 sigkill modules [0], we're reviewed them each and still at least
 items 2-4 remain particularly valid fundamental reasons to avoid it

So items 2-4 basically say there currently are drivers that cannot
deal with sigkill after a three minute timeout.

In the short-term we already have the solution: increase the timeout.
In the long-term, we have two choices, either permanently add some
heuristic to udev to deal with drivers taking a very long time to be
inserted, or fix the drivers not to take such a long time. A priori,
it makes no sense to me that drivers spend unbounded amounts of time
to get inserted, so fixing the drivers seems like the most reasonable
approach to me. That said, I'm of course open to be proven wrong if
there are some drivers that fundamentally _must_ take a long time to
insert (but we should then discuss why that is and how we can best
deal with the situation, rather than adding some hack up-front when we
don't even know if it is needed).

Your patch series should go a long way towards fixing the drivers (and
I imagine there being a lot of low-hanging fruit that can easily be
fixed once your series has landed), and the fact that we have now
increased the udev timeout from 30 to 180 seconds should also greatly
reduce the problem.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-10-02 Thread Tom Gundersen

On Tue, Sep 30, 2014 at 5:24 PM, Luis R. Rodriguez  wrote:
>> > commit e64fae5573e566ce4fd9b23c68ac8f3096603314
>> > Author: Kay Sievers 
>> > Date:   Wed Jan 18 05:06:18 2012 +0100
>> >
>> > udevd: kill hanging event processes after 30 seconds
>> >
>> > Some broken kernel drivers load firmware synchronously in the module 
>> > init
>> > path and block modprobe until the firmware request is fulfilled.
>> > <...>
>>
>> This was a workaround to avoid a deadlock between udev and the kernel.
>> The 180 s timeout was already in place before this change, and was not
>> motivated by firmware loading. Also note that this patch was not about
>> "tracking device drivers", just about avoiding dead-lock.
>
> Thanks, can you elaborate on how a deadlock can occur if the kmod
> worker is not at some point sigkilled?

This was only relevant whet udev did the firmware loading. modprobe
would wait for the kernel, which would wait for the firmware loading,
which would wait for modprobe. This is no longer a problem as udev
does not do firmware loading any more.

> Is the issue that if there is no extra worker available and all are
> idling on sleep / synchronous long work boot will potentially hang
> unless a new worker becomes available to do more work?

Correct.

> If so I can
> see the sigkill helping for hanging tasks but it doesn't necessarily
> mean its a good idea to kill modules loading taking a while. Also
> what if the sigkill is just avoided for *just* kmod workers?

Depending on the number of devices you have, I suppose we could still
exhaust the workers.

>> The way I see it, the current status from systemd's side is: our
>> short-term work-around is to increase the timeout, and at the moment
>> it appears no long-term solution is needed (i.e., it seems like the
>> right thing to do is to make sure insmod can be near instantaneous, it
>> appears people are working towards this goal, and so far no examples
>> have cropped up showing that it is fundamentally impossible (once/if
>> they do, we should of course revisit the problem)).
>
> That again would be reactive behaviour, what would prevent avoiding the
> sigkill only for kmod workers? Is it known the deadlock is immiment?
> If the amount of workers for kmod that would hit the timeout is
> considered low I don't see how that's possible and why not just lift
> the sigkill.

Making kmod a special case is of course possible. However, as long as
there is no fundamental reason why kmod should get this special
treatment, this just looks like a work-around to me. We already have a
work-around, which is to increase the global timeout. If you still
think we should do something different in systemd, it is probably best
to take the discussion to systemd-devel to make sure all the relevant
people are involved.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-10-02 Thread Tom Gundersen

On Tue, Sep 30, 2014 at 5:24 PM, Luis R. Rodriguez mcg...@suse.com wrote:
  commit e64fae5573e566ce4fd9b23c68ac8f3096603314
  Author: Kay Sievers kay.siev...@vrfy.org
  Date:   Wed Jan 18 05:06:18 2012 +0100
 
  udevd: kill hanging event processes after 30 seconds
 
  Some broken kernel drivers load firmware synchronously in the module 
  init
  path and block modprobe until the firmware request is fulfilled.
  ...

 This was a workaround to avoid a deadlock between udev and the kernel.
 The 180 s timeout was already in place before this change, and was not
 motivated by firmware loading. Also note that this patch was not about
 tracking device drivers, just about avoiding dead-lock.

 Thanks, can you elaborate on how a deadlock can occur if the kmod
 worker is not at some point sigkilled?

This was only relevant whet udev did the firmware loading. modprobe
would wait for the kernel, which would wait for the firmware loading,
which would wait for modprobe. This is no longer a problem as udev
does not do firmware loading any more.

 Is the issue that if there is no extra worker available and all are
 idling on sleep / synchronous long work boot will potentially hang
 unless a new worker becomes available to do more work?

Correct.

 If so I can
 see the sigkill helping for hanging tasks but it doesn't necessarily
 mean its a good idea to kill modules loading taking a while. Also
 what if the sigkill is just avoided for *just* kmod workers?

Depending on the number of devices you have, I suppose we could still
exhaust the workers.

 The way I see it, the current status from systemd's side is: our
 short-term work-around is to increase the timeout, and at the moment
 it appears no long-term solution is needed (i.e., it seems like the
 right thing to do is to make sure insmod can be near instantaneous, it
 appears people are working towards this goal, and so far no examples
 have cropped up showing that it is fundamentally impossible (once/if
 they do, we should of course revisit the problem)).

 That again would be reactive behaviour, what would prevent avoiding the
 sigkill only for kmod workers? Is it known the deadlock is immiment?
 If the amount of workers for kmod that would hit the timeout is
 considered low I don't see how that's possible and why not just lift
 the sigkill.

Making kmod a special case is of course possible. However, as long as
there is no fundamental reason why kmod should get this special
treatment, this just looks like a work-around to me. We already have a
work-around, which is to increase the global timeout. If you still
think we should do something different in systemd, it is probably best
to take the discussion to systemd-devel to make sure all the relevant
people are involved.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-09-30 Thread Tom Gundersen

On Tue, Sep 30, 2014 at 4:27 AM, Luis R. Rodriguez  wrote:
> On Sun, Sep 28, 2014 at 07:07:24PM +0200, Tom Gundersen wrote:
>> On Fri, Sep 26, 2014 at 11:57 PM, Luis R. Rodriguez
>>  wrote:
>> > From: "Luis R. Rodriguez" 
>> > Systemd has a general timeout for all workers currently set to 180
>> > seconds after which it will send a sigkill signal. Systemd now has a
>> > warning which is issued once it reaches 1/3 of the timeout. The original
>> > motivation for the systemd timeout was to help track device drivers
>> > which do not use asynch firmware loading on init() and the timeout was
>> > originally set to 30 seconds.
>>
>> Please note that the motivation for the timeout in systemd had nothing
>> to do with async firmware loading (that was just the case where
>> problems cropped up).
>
> *Part *of the original kill logic, according to the commit log, was actually
> due to the assumption that the issues observed *were* synchronous firmware
> loading on module init():
>
> commit e64fae5573e566ce4fd9b23c68ac8f3096603314
> Author: Kay Sievers 
> Date:   Wed Jan 18 05:06:18 2012 +0100
>
> udevd: kill hanging event processes after 30 seconds
>
> Some broken kernel drivers load firmware synchronously in the module init
> path and block modprobe until the firmware request is fulfilled.
> <...>

This was a workaround to avoid a deadlock between udev and the kernel.
The 180 s timeout was already in place before this change, and was not
motivated by firmware loading. Also note that this patch was not about
"tracking device drivers", just about avoiding dead-lock.

> My point here is not to point fingers but to explain why we went on with
> this and how we failed to realize only until later that the driver core
> ran probe together with init. When a few folks pointed out the issues
> with the kill the issue was punted back to kernel developers and the
> assumption even among some kernel maintainers was that it was init paths
> with sync behaviour that was causing some delays and they were broken
> drivers. It is important to highlight these assumptions ended up setting
> us off on the wrong path for a while in a hunt to try to fix this issue
> either in driver or elsewhere.

Ok. I'm not sure the motivations for user-space changes is important
to include in the commit message, but if you do I'll try to clarify
things to avoid misunderstandings.

> Thanks for clarifying this, can you explain what issues could arise
> from making an exception to allowing kmod workers to hang around
> completing init + probe over a certain defined amount of time without
> being killed?

We could run out of udev workers and the whole boot would hang.

The way I see it, the current status from systemd's side is: our
short-term work-around is to increase the timeout, and at the moment
it appears no long-term solution is needed (i.e., it seems like the
right thing to do is to make sure insmod can be near instantaneous, it
appears people are working towards this goal, and so far no examples
have cropped up showing that it is fundamentally impossible (once/if
they do, we should of course revisit the problem)).

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-09-30 Thread Tom Gundersen

On Tue, Sep 30, 2014 at 4:27 AM, Luis R. Rodriguez mcg...@suse.com wrote:
 On Sun, Sep 28, 2014 at 07:07:24PM +0200, Tom Gundersen wrote:
 On Fri, Sep 26, 2014 at 11:57 PM, Luis R. Rodriguez
 mcg...@do-not-panic.com wrote:
  From: Luis R. Rodriguez mcg...@suse.com
  Systemd has a general timeout for all workers currently set to 180
  seconds after which it will send a sigkill signal. Systemd now has a
  warning which is issued once it reaches 1/3 of the timeout. The original
  motivation for the systemd timeout was to help track device drivers
  which do not use asynch firmware loading on init() and the timeout was
  originally set to 30 seconds.

 Please note that the motivation for the timeout in systemd had nothing
 to do with async firmware loading (that was just the case where
 problems cropped up).

 *Part *of the original kill logic, according to the commit log, was actually
 due to the assumption that the issues observed *were* synchronous firmware
 loading on module init():

 commit e64fae5573e566ce4fd9b23c68ac8f3096603314
 Author: Kay Sievers kay.siev...@vrfy.org
 Date:   Wed Jan 18 05:06:18 2012 +0100

 udevd: kill hanging event processes after 30 seconds

 Some broken kernel drivers load firmware synchronously in the module init
 path and block modprobe until the firmware request is fulfilled.
 ...

This was a workaround to avoid a deadlock between udev and the kernel.
The 180 s timeout was already in place before this change, and was not
motivated by firmware loading. Also note that this patch was not about
tracking device drivers, just about avoiding dead-lock.

 My point here is not to point fingers but to explain why we went on with
 this and how we failed to realize only until later that the driver core
 ran probe together with init. When a few folks pointed out the issues
 with the kill the issue was punted back to kernel developers and the
 assumption even among some kernel maintainers was that it was init paths
 with sync behaviour that was causing some delays and they were broken
 drivers. It is important to highlight these assumptions ended up setting
 us off on the wrong path for a while in a hunt to try to fix this issue
 either in driver or elsewhere.

Ok. I'm not sure the motivations for user-space changes is important
to include in the commit message, but if you do I'll try to clarify
things to avoid misunderstandings.

 Thanks for clarifying this, can you explain what issues could arise
 from making an exception to allowing kmod workers to hang around
 completing init + probe over a certain defined amount of time without
 being killed?

We could run out of udev workers and the whole boot would hang.

The way I see it, the current status from systemd's side is: our
short-term work-around is to increase the timeout, and at the moment
it appears no long-term solution is needed (i.e., it seems like the
right thing to do is to make sure insmod can be near instantaneous, it
appears people are working towards this goal, and so far no examples
have cropped up showing that it is fundamentally impossible (once/if
they do, we should of course revisit the problem)).

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-09-28 Thread Tom Gundersen

Hi Luis,

Thanks for the patches and the detailed analysis.

Feel free to add

Acked-by: Tom Gundersen 

Minor comments on the commit message below.

On Fri, Sep 26, 2014 at 11:57 PM, Luis R. Rodriguez
 wrote:
> From: "Luis R. Rodriguez" 
>
> Some init systems may wish to express the desire to have
> device drivers run their device driver's bus probe() run
> asynchronously. This implements support for this and
> allows userspace to request async probe as a preference
> through a generic shared device driver module parameter,
> async_probe. Implemention for async probe is supported
> through a module parameter given that since synchronous
> probe has been prevalent for years some userspace might
> exist which relies on the fact that the device driver will
> probe synchronously and the assumption that devices it
> provides will be immediately available after this.
>
> Some device driver might not be able to run async probe
> so we enable device drivers to annotate this to prevent
> this module parameter from having any effect on them.
>
> This implementation uses queue_work(system_unbound_wq)
> to queue async probes, this should enable probe to run
> slightly *faster* if the driver's probe path did not
> have much interaction with other workqueues otherwise
> it may run _slightly_ slower. Tests were done with cxgb4,
> which is known to take long on probe, both without
> having to run request_firmware() [0] and then by
> requiring it to use request_firmware() [1]. The
> difference in run time are only measurable in microseconds:
>
> =|
> strategyfw (usec)   no-fw (usec) |
> -|
> synchronous 244725691307563  |
> kthread 25066415.5  1309868.5|
> queue_work(system_unbound_wq)   24913661.5  1307631  |
> -|
>
> In practice, in seconds, the difference is barely noticeable:
>
> =|
> strategyfw (s)  no-fw (s)|
> -|
> synchronous 24.47   1.31 |
> kthread 25.07   1.31 |
> queue_work(system_unbound_wq)   24.91   1.31 |
> -|
>
> [0] 
> http://ftp.suse.com/pub/people/mcgrof/async-probe/probe-cgxb4-no-firmware.png
> [1] http://ftp.suse.com/pub/people/mcgrof/async-probe/probe-cgxb4-firmware.png
>
> The rest of the commit log documents why this feature was implemented
> primarily first for systemd and things it should consider next.
>
> Systemd has a general timeout for all workers currently set to 180
> seconds after which it will send a sigkill signal. Systemd now has a
> warning which is issued once it reaches 1/3 of the timeout. The original
> motivation for the systemd timeout was to help track device drivers
> which do not use asynch firmware loading on init() and the timeout was
> originally set to 30 seconds.

Please note that the motivation for the timeout in systemd had nothing
to do with async firmware loading (that was just the case where
problems cropped up). The motivation was to not allow udev-workers to
stay around indefinitely, and hence put an upper-bound on
their duration (initially 180 s). At some point the bound was reduced
to 30 seconds to make sure module-loading would bail out before the
kernel's firmware loading timeout would bail out (60s I believe). That
is no longer relevant, which is why it was safe to reset the timeout
to 180 s.

> Since systemd + kernel are heavily tied in for the purposes of this
> patch it is assumed you have merged on systemd the following
> commits:
>
> 671174136525ddf208cdbe75d6d6bd159afa961fudev: timeout - warn after a 
> third of the timeout before killing
> b5338a19864ac3f5632aee48069a669479621dcaudev: timeout - increase 
> timeout
> 2e92633dbae52f5ac9b7b2e068935990d475d2cdudev: bump event timeout to 
> 60 seconds
> be2ea723b1d023b3d385d3b791ee4607cbfb20caudev: remove userspace 
> firmware loading support
> 9f20a8a376f924c8eb5423cfc1f98644fc1e2d1audev: fixup commit
> dd5eddd28a74a49607a8fffcaf960040dba98479udev: unify event timeout 
> handling
> 9719859c07aa13539ed2cd4b31972cd30f678543udevd: add --event-timeout 
> commandline option
>
> Since we bundle together serially driver ini

Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support

2014-09-28 Thread Tom Gundersen

Hi Luis,

Thanks for the patches and the detailed analysis.

Feel free to add

Acked-by: Tom Gundersen t...@jklm.no

Minor comments on the commit message below.

On Fri, Sep 26, 2014 at 11:57 PM, Luis R. Rodriguez
mcg...@do-not-panic.com wrote:
 From: Luis R. Rodriguez mcg...@suse.com

 Some init systems may wish to express the desire to have
 device drivers run their device driver's bus probe() run
 asynchronously. This implements support for this and
 allows userspace to request async probe as a preference
 through a generic shared device driver module parameter,
 async_probe. Implemention for async probe is supported
 through a module parameter given that since synchronous
 probe has been prevalent for years some userspace might
 exist which relies on the fact that the device driver will
 probe synchronously and the assumption that devices it
 provides will be immediately available after this.

 Some device driver might not be able to run async probe
 so we enable device drivers to annotate this to prevent
 this module parameter from having any effect on them.

 This implementation uses queue_work(system_unbound_wq)
 to queue async probes, this should enable probe to run
 slightly *faster* if the driver's probe path did not
 have much interaction with other workqueues otherwise
 it may run _slightly_ slower. Tests were done with cxgb4,
 which is known to take long on probe, both without
 having to run request_firmware() [0] and then by
 requiring it to use request_firmware() [1]. The
 difference in run time are only measurable in microseconds:

 =|
 strategyfw (usec)   no-fw (usec) |
 -|
 synchronous 244725691307563  |
 kthread 25066415.5  1309868.5|
 queue_work(system_unbound_wq)   24913661.5  1307631  |
 -|

 In practice, in seconds, the difference is barely noticeable:

 =|
 strategyfw (s)  no-fw (s)|
 -|
 synchronous 24.47   1.31 |
 kthread 25.07   1.31 |
 queue_work(system_unbound_wq)   24.91   1.31 |
 -|

 [0] 
 http://ftp.suse.com/pub/people/mcgrof/async-probe/probe-cgxb4-no-firmware.png
 [1] http://ftp.suse.com/pub/people/mcgrof/async-probe/probe-cgxb4-firmware.png

 The rest of the commit log documents why this feature was implemented
 primarily first for systemd and things it should consider next.

 Systemd has a general timeout for all workers currently set to 180
 seconds after which it will send a sigkill signal. Systemd now has a
 warning which is issued once it reaches 1/3 of the timeout. The original
 motivation for the systemd timeout was to help track device drivers
 which do not use asynch firmware loading on init() and the timeout was
 originally set to 30 seconds.

Please note that the motivation for the timeout in systemd had nothing
to do with async firmware loading (that was just the case where
problems cropped up). The motivation was to not allow udev-workers to
stay around indefinitely, and hence put an upper-bound on
their duration (initially 180 s). At some point the bound was reduced
to 30 seconds to make sure module-loading would bail out before the
kernel's firmware loading timeout would bail out (60s I believe). That
is no longer relevant, which is why it was safe to reset the timeout
to 180 s.

 Since systemd + kernel are heavily tied in for the purposes of this
 patch it is assumed you have merged on systemd the following
 commits:

 671174136525ddf208cdbe75d6d6bd159afa961fudev: timeout - warn after a 
 third of the timeout before killing
 b5338a19864ac3f5632aee48069a669479621dcaudev: timeout - increase 
 timeout
 2e92633dbae52f5ac9b7b2e068935990d475d2cdudev: bump event timeout to 
 60 seconds
 be2ea723b1d023b3d385d3b791ee4607cbfb20caudev: remove userspace 
 firmware loading support
 9f20a8a376f924c8eb5423cfc1f98644fc1e2d1audev: fixup commit
 dd5eddd28a74a49607a8fffcaf960040dba98479udev: unify event timeout 
 handling
 9719859c07aa13539ed2cd4b31972cd30f678543udevd: add --event-timeout 
 commandline option

 Since we bundle together serially driver init() and probe()
 on module initialiation systemd's imposed timeout  put a limit on the
 amount of time a driver init() and probe routines can take. There's a
 few overlooked issues with this and the timeout in general:

 0) Not all drivers are killed, the signal is just sent

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-11 Thread Tom Gundersen

On Fri, Sep 12, 2014 at 12:26 AM, Luis R. Rodriguez
 wrote:
> On Thu, Sep 11, 2014 at 2:43 PM, Tom Gundersen  wrote:
>> How about simply introducing a new flag to finit_module() to indicate
>> that the caller does not care about asynchronicity. We could then pass
>> this from udev, but existing scripts calling modprobe/insmod will not
>> be affected.
>
> Do you mean that you *do want asynchronicity*?

Precisely, udev would opt-in, but existing scripts etc would not.

>> But isn't finit_module() taking a long time a serious problem given
>> that it means no other module can be loaded in parallel?
>
> Indeed but having a desire to make the init() complete fast is
> different than the desire to have the combination of both init and
> probe fast synchronously.

I guess no one is arguing that probe should somehow be required to be
fast, but rather:

> If userspace wants init to be fast and let
> probe be async then userspace has no option but to deal with the fact
> that async probe will be async, and it should then use other methods
> to match any dependencies if its doing that itself.

Correct. And this therefore likely needs to be opt-in behaviour per
finit_module() invocation to avoid breaking old assumptions.

> For example
> networking should not kick off after a network driver is loaded but
> rather one the device creeps up on udev. We should be good with
> networking dealing with this correctly today but not sure about other
> subsystems. depmod should be able to load the required modules in
> order and if bus drivers work right then probe of the remnant devices
> should happen asynchronously. The one case I can think of that is a
> bit different is modules-load.d things but those *do not rely on the
> timeout*, but are loaded prior to a service requirement. Note though
> that if those modules had probe and they then run async'd then systemd
> service would probably need to consider that the requirements may not
> be there until later. If this is not carefully considered that could
> introduce regression to users of modules-load.d when async probe is
> fully deployed. The same applies to systemd making assumptions of kmod
> loading a module and a dependency being complete as probe would have
> run it before.

Yeah, these all needs to be considered when deciding whether or not to
enable async in each specific case.

> I believe one concern here lies in on whether or not userspace
> is properly equipped to deal with the requirements on module loading
> doing async probing and that possibly failing. Perhaps systemd might
> think all userspace is ready for that but are we sure that's the case?

There almost certainly are custom things out there relying on the
synchronous behaviour, but if we make it opt-in we should not have a
problem.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-11 Thread Tom Gundersen

On Wed, Sep 10, 2014 at 11:10 PM, Luis R. Rodriguez
 wrote:
>>> More than two years
>>> have gone by on growing design and assumptions on top of that original
>>> commit. I'm not sure if *systemd folks* yet believe its was a design
>>> regression?
>>
>> I don't think so. udev should not allow its workers to run for an
>> unbounded length of time. Whether the upper bound should be 30, 60,
>> 180 seconds or something else is up for debate (currently it is 60,
>> but if that is too short for some drivers we could certainly revisit
>> that).
>
> That's the thing -- the timeout was put in place under the assumption
> probe was asyncronous and its not, the driver core issues both module
> init *and* probe together, the loader has to wait. That alone makes
> the timeout a design flaw, and then systemd carried on top of that
> design over two years after that. Its not systemd's fault, its just
> that we never spoke about this as a design thing broadly and we should
> have, and I will mention that even when the first issues creeped up,
> the issue was still tossed back a driver problems. It was only until
> recently that we realized that both init and probe run together that
> we've been thinking about this problem differently. Systemd was trying
> to ensure init on driver don't take long but its not init that is
> taking long, its probe, and probe gets then penalized as the driver
> core batches both init and probe synchronously before finishing module
> loading.

Just to clarify: udev/systemd is not trying to get into the business
of what the kernel does on finit_module(), we just need to make sure
that none of our workers stay around forever, which is why we have a
global timeout. If necessary we can bump this higher (as mentioned in
another thread I just bumped it to 180 secs), but we cannot abolish it
entirely.

> Furthermore as clarified by Tejun random userland is known to
> exist that will wait indefinitely for module loading under the simple
> assumption things *are done synchronously*, and its precisely why we
> can't just blindly enable async probe upstream through a new driver
> boolean as it can be unfair to this old userland. What is being
> evaluated is to enable aync probe for *all* drivers through a new
> general system-wide option. We cannot regress old userspace and
> assumptions but we can create a new shiny universe.

How about simply introducing a new flag to finit_module() to indicate
that the caller does not care about asynchronicity. We could then pass
this from udev, but existing scripts calling modprobe/insmod will not
be affected.

>> Moreover, it seems from this discussion that the aim is (still)
>> that insmod should be near-instantaneous (i.e., not wait for probe),
>
> The only reason that is being discussed is that systemd has not
> accepted the timeout as a system design despite me pointing out the
> original design flaw recently and at this point even if was accepted
> as a design flaw it seems its too late. The approach taken to help
> make all drivers async is simply an afterthought to give systemd what
> it *thought* was in place, and it by no measure should be considered
> the proper fix to the regression introduced by systemd, it may perhaps
> be the right step long term for systemd systems given it goes with
> what it assumed was there, but the timeout was flawed. Its not clear
> if systemd can help with old kernels, it seems the ship has sailed and
> there seems no options but for folks to work around that -- unless of
> course some reasonable solution is found which doesn't break current
> systemd design?

If I read the git logs correctly the hard timeout was introduced in
April 2011, so reverting it now seems indeed not to help much with all
the running systems out there.

> As part of this series I addressed hunting for the  "misbehaving
> drivers" in-kernel as I saw no progress on the systemd side of things
> to non-fatally detect "misbehaving drivers" despite my original RFCs
> and request for advice. I quote  "misbehaving drivers" as its a flawed
> view to consider them misbehaving now in light of clarifications of
> how the driver core works in that it batches both init and probe
> together always and we can't be penalizing long probes due to the fact
> long probes are simply fine. My patch to pick up "misbehaving drivers"
> drivers on the kernel front by picking up systemd's signal was
> reactive but it was also simply braindead given the same exact reasons
> why systemd's original timeout was flawed. We want a general solution
> and we don't want to work around the root cause, in this case it was
> systemd's assumption on how drivers work.

Would your ongoing work to make probing asynchronous solve this
problem in the long-term? In the short-term I guess bumping the udev
timeout should be sufficient.

> Keep in mind that the above just addresses kmod built-in cmd on
> systemd, its where the timeout was introduced but as has been
> clarified here assuming the same

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-11 Thread Tom Gundersen

On Wed, Sep 10, 2014 at 11:10 PM, Luis R. Rodriguez
mcg...@do-not-panic.com wrote:
 More than two years
 have gone by on growing design and assumptions on top of that original
 commit. I'm not sure if *systemd folks* yet believe its was a design
 regression?

 I don't think so. udev should not allow its workers to run for an
 unbounded length of time. Whether the upper bound should be 30, 60,
 180 seconds or something else is up for debate (currently it is 60,
 but if that is too short for some drivers we could certainly revisit
 that).

 That's the thing -- the timeout was put in place under the assumption
 probe was asyncronous and its not, the driver core issues both module
 init *and* probe together, the loader has to wait. That alone makes
 the timeout a design flaw, and then systemd carried on top of that
 design over two years after that. Its not systemd's fault, its just
 that we never spoke about this as a design thing broadly and we should
 have, and I will mention that even when the first issues creeped up,
 the issue was still tossed back a driver problems. It was only until
 recently that we realized that both init and probe run together that
 we've been thinking about this problem differently. Systemd was trying
 to ensure init on driver don't take long but its not init that is
 taking long, its probe, and probe gets then penalized as the driver
 core batches both init and probe synchronously before finishing module
 loading.

Just to clarify: udev/systemd is not trying to get into the business
of what the kernel does on finit_module(), we just need to make sure
that none of our workers stay around forever, which is why we have a
global timeout. If necessary we can bump this higher (as mentioned in
another thread I just bumped it to 180 secs), but we cannot abolish it
entirely.

 Furthermore as clarified by Tejun random userland is known to
 exist that will wait indefinitely for module loading under the simple
 assumption things *are done synchronously*, and its precisely why we
 can't just blindly enable async probe upstream through a new driver
 boolean as it can be unfair to this old userland. What is being
 evaluated is to enable aync probe for *all* drivers through a new
 general system-wide option. We cannot regress old userspace and
 assumptions but we can create a new shiny universe.

How about simply introducing a new flag to finit_module() to indicate
that the caller does not care about asynchronicity. We could then pass
this from udev, but existing scripts calling modprobe/insmod will not
be affected.

 Moreover, it seems from this discussion that the aim is (still)
 that insmod should be near-instantaneous (i.e., not wait for probe),

 The only reason that is being discussed is that systemd has not
 accepted the timeout as a system design despite me pointing out the
 original design flaw recently and at this point even if was accepted
 as a design flaw it seems its too late. The approach taken to help
 make all drivers async is simply an afterthought to give systemd what
 it *thought* was in place, and it by no measure should be considered
 the proper fix to the regression introduced by systemd, it may perhaps
 be the right step long term for systemd systems given it goes with
 what it assumed was there, but the timeout was flawed. Its not clear
 if systemd can help with old kernels, it seems the ship has sailed and
 there seems no options but for folks to work around that -- unless of
 course some reasonable solution is found which doesn't break current
 systemd design?

If I read the git logs correctly the hard timeout was introduced in
April 2011, so reverting it now seems indeed not to help much with all
the running systems out there.

 As part of this series I addressed hunting for the  misbehaving
 drivers in-kernel as I saw no progress on the systemd side of things
 to non-fatally detect misbehaving drivers despite my original RFCs
 and request for advice. I quote  misbehaving drivers as its a flawed
 view to consider them misbehaving now in light of clarifications of
 how the driver core works in that it batches both init and probe
 together always and we can't be penalizing long probes due to the fact
 long probes are simply fine. My patch to pick up misbehaving drivers
 drivers on the kernel front by picking up systemd's signal was
 reactive but it was also simply braindead given the same exact reasons
 why systemd's original timeout was flawed. We want a general solution
 and we don't want to work around the root cause, in this case it was
 systemd's assumption on how drivers work.

Would your ongoing work to make probing asynchronous solve this
problem in the long-term? In the short-term I guess bumping the udev
timeout should be sufficient.

 Keep in mind that the above just addresses kmod built-in cmd on
 systemd, its where the timeout was introduced but as has been
 clarified here assuming the same timeout on *all* other built-in
 likely is likely pretty flawed as

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-11 Thread Tom Gundersen

On Fri, Sep 12, 2014 at 12:26 AM, Luis R. Rodriguez
mcg...@do-not-panic.com wrote:
 On Thu, Sep 11, 2014 at 2:43 PM, Tom Gundersen t...@jklm.no wrote:
 How about simply introducing a new flag to finit_module() to indicate
 that the caller does not care about asynchronicity. We could then pass
 this from udev, but existing scripts calling modprobe/insmod will not
 be affected.

 Do you mean that you *do want asynchronicity*?

Precisely, udev would opt-in, but existing scripts etc would not.

 But isn't finit_module() taking a long time a serious problem given
 that it means no other module can be loaded in parallel?

 Indeed but having a desire to make the init() complete fast is
 different than the desire to have the combination of both init and
 probe fast synchronously.

I guess no one is arguing that probe should somehow be required to be
fast, but rather:

 If userspace wants init to be fast and let
 probe be async then userspace has no option but to deal with the fact
 that async probe will be async, and it should then use other methods
 to match any dependencies if its doing that itself.

Correct. And this therefore likely needs to be opt-in behaviour per
finit_module() invocation to avoid breaking old assumptions.

 For example
 networking should not kick off after a network driver is loaded but
 rather one the device creeps up on udev. We should be good with
 networking dealing with this correctly today but not sure about other
 subsystems. depmod should be able to load the required modules in
 order and if bus drivers work right then probe of the remnant devices
 should happen asynchronously. The one case I can think of that is a
 bit different is modules-load.d things but those *do not rely on the
 timeout*, but are loaded prior to a service requirement. Note though
 that if those modules had probe and they then run async'd then systemd
 service would probably need to consider that the requirements may not
 be there until later. If this is not carefully considered that could
 introduce regression to users of modules-load.d when async probe is
 fully deployed. The same applies to systemd making assumptions of kmod
 loading a module and a dependency being complete as probe would have
 run it before.

Yeah, these all needs to be considered when deciding whether or not to
enable async in each specific case.

 I believe one concern here lies in on whether or not userspace
 is properly equipped to deal with the requirements on module loading
 doing async probing and that possibly failing. Perhaps systemd might
 think all userspace is ready for that but are we sure that's the case?

There almost certainly are custom things out there relying on the
synchronous behaviour, but if we make it opt-in we should not have a
problem.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-10 Thread Tom Gundersen

On Tue, Sep 9, 2014 at 10:45 PM, Luis R. Rodriguez
 wrote:
> On Tue, Sep 9, 2014 at 12:35 PM, James Bottomley
>  wrote:
>> On Tue, 2014-09-09 at 12:16 -0700, Luis R. Rodriguez wrote:
>>> On Mon, Sep 8, 2014 at 10:38 PM, James Bottomley
>>>  wrote:
>>> > If we want to sort out some sync/async mechanism for probing devices, as
>>> > an agreement between the init systems and the kernel, that's fine, but
>>> > its a to-be negotiated enhancement.
>>>
>>> Unfortunately as Tejun notes the train has left which already made
>>> assumptions on this.
>>
>> Well, that's why it's a bug.  It's a material regression impacting
>> users.
>
> Indeed. I believe the issue with this regression however was that the
> original commit e64fae55 (January 2012) was only accepted by *kernel
> folks* to be a real regression until recently.

Just for the record, this only caused user-visible problems after
kernel commit 786235ee (November 2013), right?

> More than two years
> have gone by on growing design and assumptions on top of that original
> commit. I'm not sure if *systemd folks* yet believe its was a design
> regression?

I don't think so. udev should not allow its workers to run for an
unbounded length of time. Whether the upper bound should be 30, 60,
180 seconds or something else is up for debate (currently it is 60,
but if that is too short for some drivers we could certainly revisit
that). Moreover, it seems from this discussion that the aim is (still)
that insmod should be near-instantaneous (i.e., not wait for probe),
so it seems to me that the basic design is correct and all we need is
some temporary work-around and a way to better detect misbehaving
drivers?

>>>  I'm afraid distributions that want to avoid this
>>> sigkill at least on the kernel front will have to work around this
>>> issue either on systemd by increasing the default timeout which is now
>>> possible thanks to Hannes' changes or by some other means such as the
>>> combination of a modified non-chatty version of this patch + a check
>>> at the end of load_module() as mentioned earlier on these threads.
>>
>> Increasing the default timeout in systemd seems like the obvious bug fix
>> to me.  If the patch exists already, having distros that want it use it
>> looks to be correct ... not every bug is a kernel bug, after all.
>
> Its merged upstream on systemd now, along with a few fixes on top of
> it. I also see Kay merged a change to the default timeout to 60 second
> on August 30. Its unclear if these discussions had any impact on that
> decision or if that was just because udev firmware loading got now
> ripped out. I'll note that the new 60 second timeout wouldn't suffice
> for cxgb4 even if it didn't do firmware loading, its probe takes over
> one full minute.
>
>> Negotiating a probe vs init split for drivers is fine too, but it's a
>> longer term thing rather than a bug fix.
>
> Indeed. What I proposed with a multiplier for the timeout for the
> different types of built in commands was deemed complex but saw no
> alternatives proposed despite my interest to work on one and
> clarifications noted that this was a design regression. Not quite sure
> what else I could have done here. I'm interested in learning what the
> better approach is for the future as if we want to marry init + kernel
> we need a smooth way for us to discuss design without getting worked
> up about it, or taking it personal. I really want this to work as I
> personally like systemd so far.

How about this: keep the timeout global, but also introduce a
(relatively short, say 10 or 15 seconds) timeout after which a warning
is printed. Even if nothing is actually killed, having workers (be it
insmod or something else) take longer than a couple of seconds is
likely a sign that something is seriously off somewhere.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [systemd-devel] [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-10 Thread Tom Gundersen

On Tue, Sep 9, 2014 at 10:45 PM, Luis R. Rodriguez
mcg...@do-not-panic.com wrote:
 On Tue, Sep 9, 2014 at 12:35 PM, James Bottomley
 james.bottom...@hansenpartnership.com wrote:
 On Tue, 2014-09-09 at 12:16 -0700, Luis R. Rodriguez wrote:
 On Mon, Sep 8, 2014 at 10:38 PM, James Bottomley
 james.bottom...@hansenpartnership.com wrote:
  If we want to sort out some sync/async mechanism for probing devices, as
  an agreement between the init systems and the kernel, that's fine, but
  its a to-be negotiated enhancement.

 Unfortunately as Tejun notes the train has left which already made
 assumptions on this.

 Well, that's why it's a bug.  It's a material regression impacting
 users.

 Indeed. I believe the issue with this regression however was that the
 original commit e64fae55 (January 2012) was only accepted by *kernel
 folks* to be a real regression until recently.

Just for the record, this only caused user-visible problems after
kernel commit 786235ee (November 2013), right?

 More than two years
 have gone by on growing design and assumptions on top of that original
 commit. I'm not sure if *systemd folks* yet believe its was a design
 regression?

I don't think so. udev should not allow its workers to run for an
unbounded length of time. Whether the upper bound should be 30, 60,
180 seconds or something else is up for debate (currently it is 60,
but if that is too short for some drivers we could certainly revisit
that). Moreover, it seems from this discussion that the aim is (still)
that insmod should be near-instantaneous (i.e., not wait for probe),
so it seems to me that the basic design is correct and all we need is
some temporary work-around and a way to better detect misbehaving
drivers?

  I'm afraid distributions that want to avoid this
 sigkill at least on the kernel front will have to work around this
 issue either on systemd by increasing the default timeout which is now
 possible thanks to Hannes' changes or by some other means such as the
 combination of a modified non-chatty version of this patch + a check
 at the end of load_module() as mentioned earlier on these threads.

 Increasing the default timeout in systemd seems like the obvious bug fix
 to me.  If the patch exists already, having distros that want it use it
 looks to be correct ... not every bug is a kernel bug, after all.

 Its merged upstream on systemd now, along with a few fixes on top of
 it. I also see Kay merged a change to the default timeout to 60 second
 on August 30. Its unclear if these discussions had any impact on that
 decision or if that was just because udev firmware loading got now
 ripped out. I'll note that the new 60 second timeout wouldn't suffice
 for cxgb4 even if it didn't do firmware loading, its probe takes over
 one full minute.

 Negotiating a probe vs init split for drivers is fine too, but it's a
 longer term thing rather than a bug fix.

 Indeed. What I proposed with a multiplier for the timeout for the
 different types of built in commands was deemed complex but saw no
 alternatives proposed despite my interest to work on one and
 clarifications noted that this was a design regression. Not quite sure
 what else I could have done here. I'm interested in learning what the
 better approach is for the future as if we want to marry init + kernel
 we need a smooth way for us to discuss design without getting worked
 up about it, or taking it personal. I really want this to work as I
 personally like systemd so far.

How about this: keep the timeout global, but also introduce a
(relatively short, say 10 or 15 seconds) timeout after which a warning
is printed. Even if nothing is actually killed, having workers (be it
insmod or something else) take longer than a couple of seconds is
likely a sign that something is seriously off somewhere.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-09 Thread Tom Gundersen

On Tue, Sep 9, 2014 at 3:26 AM, Luis R. Rodriguez
 wrote:
> On Mon, Sep 8, 2014 at 6:22 PM, Tejun Heo  wrote:
>> On Tue, Sep 09, 2014 at 10:10:59AM +0900, Tejun Heo wrote:
>>> I'm not too convinced this is such a difficult problem to figure out.
>>> We already have most of logic in place and the only thing missing is
>>> how to switch it.  Wouldn't something like the following work?
>>>
>>> * Add a sysctl knob to enable asynchronous device probing on module
>>>   load and enable asynchronous probing globally if the knob is set.
>>
>> Alternatively, add a module-generic param "async_probe" or whatever
>> and use that to switch the behavior should work too.  I don't know
>> which way is better but either should work fine.
>
> I take it by this you meant a generic system-wide sysctl or kernel cmd
> line option to enable this for al drivers?

If the expectation is that this feature should be enabled
unconditionally for all systemd systems, wouldn't it make more sense
to make it a Kconfig option (possibly overridable from the kernel
commandline in case that makes testing simpler)?

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC v2 3/6] kthread: warn on kill signal if not OOM

2014-09-09 Thread Tom Gundersen

On Tue, Sep 9, 2014 at 3:26 AM, Luis R. Rodriguez
mcg...@do-not-panic.com wrote:
 On Mon, Sep 8, 2014 at 6:22 PM, Tejun Heo t...@kernel.org wrote:
 On Tue, Sep 09, 2014 at 10:10:59AM +0900, Tejun Heo wrote:
 I'm not too convinced this is such a difficult problem to figure out.
 We already have most of logic in place and the only thing missing is
 how to switch it.  Wouldn't something like the following work?

 * Add a sysctl knob to enable asynchronous device probing on module
   load and enable asynchronous probing globally if the knob is set.

 Alternatively, add a module-generic param async_probe or whatever
 and use that to switch the behavior should work too.  I don't know
 which way is better but either should work fine.

 I take it by this you meant a generic system-wide sysctl or kernel cmd
 line option to enable this for al drivers?

If the expectation is that this feature should be enabled
unconditionally for all systemd systems, wouldn't it make more sense
to make it a Kconfig option (possibly overridable from the kernel
commandline in case that makes testing simpler)?

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: dell rbu driver and FW_LOADER_USER_HELPER

2014-08-26 Thread Tom Gundersen

On Tue, Aug 26, 2014 at 7:14 PM, Luis R. Rodriguez  wrote:
> On Tue, Aug 26, 2014 at 07:09:46PM +0200, Luis R. Rodriguez wrote:
>> When Christian tried removing the udev firmware loader
>> it was detected that the Dell rbu driver still required
>> use of FW_LOADER_USER_HELPER [0]. This driver needed
>> reworking to not be tied to the firmware loader, curious
>> if anyone has been working on that.

The rework is now upstream, but not yet released [1].

>> Is the current
>> implementation depending on that userspace identifies
>> the required path to the file ? Why can't standard
>> /lib/firmware/ paths be used ?

The dell driver dose not use rely on udev in userspace, nor the
/lib/firmware path. It has its own userspace component, which just
happens to reuse some of the infrastructure the udev firmware helper
used to use. This has now been split up, and as of 3.17 the firmware
loader can safely be removed from udev.

>> [0] http://lists.freedesktop.org/archives/systemd-devel/2014-May/019631.html
>
> Well abhay_salu...@dell.com bounces. Who at Dell is maintaining this
> driver now? Maybe Mario can help finding the right person?
>
>   Luis

[1]: 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: dell rbu driver and FW_LOADER_USER_HELPER

2014-08-26 Thread Tom Gundersen

On Tue, Aug 26, 2014 at 7:14 PM, Luis R. Rodriguez mcg...@suse.com wrote:
On Tue, Aug 26, 2014 at 07:09:46PM +0200, Luis R. Rodriguez wrote:
When Christian tried removing the udev firmware loader
it was detected that the Dell rbu driver still required
use of FW_LOADER_USER_HELPER [0]. This driver needed
reworking to not be tied to the firmware loader, curious
if anyone has been working on that.

The rework is now upstream, but not yet released [1].

Is the current
implementation depending on that userspace identifies
the required path to the file ? Why can't standard
/lib/firmware/ paths be used ?

The dell driver dose not use rely on udev in userspace, nor the
/lib/firmware path. It has its own userspace component, which just
happens to reuse some of the infrastructure the udev firmware helper
used to use. This has now been split up, and as of 3.17 the firmware
loader can safely be removed from udev.

[0] http://lists.freedesktop.org/archives/systemd-devel/2014-May/019631.html

Well abhay_salu...@dell.com bounces. Who at Dell is maintaining this
driver now? Maybe Mario can help finding the right person?

Luis

[1]:
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5a1379e8748a5cfa3eb068f812d61bde849ef76c
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-08-13 Thread Tom Gundersen

On Tue, Aug 12, 2014 at 10:29 AM, Dexuan Cui  wrote:
>> From: Tom Gundersen
>> > Unluckily this logic doesn't work because the user-space daemons
>> > like ifplugd, usually don't renew the DHCP immediately as long as they
>> > receive a link-down message: they usually wait for some seconds and if
>> > they find the link becomes up soon, they won't trigger renew operations.
>> > (I guess this behavior can be somewhat reasonable: maybe the daemons
>> > try to not trigger DHCP renew on temporary link instability)
>> >
>> networkd does not suffer from this problem, and in ifplugd it can be
> I didn't have time to check the code of networkd, but I think it does have the
> same behavior.
> e.g., on a bare metal host with Ubuntu 14.04, when I plug the RJ45 cable out
> of the network card and then plug the cable back into the network card
> quickly -- in ~3 seconds, networkd doesn't trigger DHCP renew request: in
> /var/log/syslog, we see
> Aug 12 11:07:07 decui-lin NetworkManager[828]:  (eth0): carrier now OFF 
> (device state 100, deferring action for 4 seconds)
> Aug 12 11:07:07 decui-lin kernel: [  246.975453] e1000e: eth0 NIC Link is Down
> Aug 12 11:07:10 decui-lin NetworkManager[828]:  (eth0): carrier now ON 
> (device state 100)
> Aug 12 11:07:10 decui-lin kernel: [  250.028533] e1000e: eth0 NIC Link is Up 
> 100 Mbps Full Duplex, Flow Control: Rx/Tx
>
> It looks there is a delay of 4s.
> I'm going to find out if there is a configurable parameter for this.

Just to avoid any confusion: you are referring to "networkd" (and so
did my comments), but the above logs are from "NetworkManager".

>> disabled. Most other network drivers will send
>> IFF_LOWER_DOWN/IFF_LOWER_UP upon suspend/resume so if you were to
>> do the same you will not work any worse than the others. What would be
> suspend/resume(ACPI S3?) is different  as this is usually > 4 seconds?  :-)

Sure, but that's clearly not something we should rely on.

>> nice, as mentioned by Dan and Lennart, would be to send an additional
>> explicit event such as "resumed from suspend" or "L3 may be wrong".
> Sorry, I neglected to reply this.
> IMHO even if we add the new event, we still need lots of efforts to
> make the various userland daemons(ifplugd, networkd, etc) to use the
> new event.
> And looks we're the first user of this new event. I'm not sure if this issue
> here can convince the network subsystem maintainers such a new event
> is a must.
>
>> That should be a generic thing though, to fix all such issues in one
>> go.
> I agree, though this requires we update all the userland daemons...
>
>> > I'm not sure our attempt to "fix" the daemons can be easily accepted.
>> > BTW, by CPUID, an application has a reliable way to determine  if it's
>> > running on hyper-v on not. Maybe we can "fix" the behavior of the
>> > daemons when they run on hyper-v?
>> > BTW2, according to my limited experience, I doubt other VMMs can
>> > handle this auto-DHCP-renew-in-guest issue properly.
>>
>> To the extent this is a problem, it is a generic one, so we should not
>> need any hyper-v specific logic in userspace.
> OK, I understood.
>
>> > That was why Yue's patch wanted to add a SLEEP(10s) between the
>> > link-down and link-up events and hoped this could be an acceptable
>> > fix(while it turned out not, obviously), though we admit it's not so good
>> > to add such a magic number "10s" in a kernel driver.
>> >
>> > Please point it out if I missed or misunderstand something.
>> >
>> > Now I understand it's not good to pass the event to the udev daemon,
>> > and it's not good to use a SLEEP(10s) in the kernel space(even if it's in a
>> > "work" task here).
>>
>> Please just expose to userspace what is happening (link lost/gained,
>> resumed from suspend/...), and let us sort out how to react to that.
>> If you put assumptions about what kind of timeouts (long-dead)
>> userspace uses into your drivers you'll just create a mess.
> OK, I got it now.
> So I think I'm supposed to send out a netif_carrier_off()/on() patch,
> and I'd better do this after I verify the daemons can work with
> proper parameters specified.
>
>> > Please let me know if it's the correct direction to fix the user-space
>> > daemons (ifplugd, systemd-networkd, etc).
>> > If you think this is viable and we should do this, I'll submit a
>> > netif_carrier_off/on patch first and will start to work with the
>> > projects of ifplugd, systemd-networkd and many OSVs to make the
>> > while thing work eventually.
>>
>> Have you actually checked that carrier_off/on does not work on
>> anything but ifplugd? It would surprise me...
> I can confirm carrier_off/on with 0 delay between the off and on
> doesn't work for ifplugd and networkd.
>
> Thanks,
> -- Dexuan
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-08-13 Thread Tom Gundersen

On Tue, Aug 12, 2014 at 10:29 AM, Dexuan Cui de...@microsoft.com wrote:
 From: Tom Gundersen
  Unluckily this logic doesn't work because the user-space daemons
  like ifplugd, usually don't renew the DHCP immediately as long as they
  receive a link-down message: they usually wait for some seconds and if
  they find the link becomes up soon, they won't trigger renew operations.
  (I guess this behavior can be somewhat reasonable: maybe the daemons
  try to not trigger DHCP renew on temporary link instability)
 
 networkd does not suffer from this problem, and in ifplugd it can be
 I didn't have time to check the code of networkd, but I think it does have the
 same behavior.
 e.g., on a bare metal host with Ubuntu 14.04, when I plug the RJ45 cable out
 of the network card and then plug the cable back into the network card
 quickly -- in ~3 seconds, networkd doesn't trigger DHCP renew request: in
 /var/log/syslog, we see
 Aug 12 11:07:07 decui-lin NetworkManager[828]: info (eth0): carrier now OFF 
 (device state 100, deferring action for 4 seconds)
 Aug 12 11:07:07 decui-lin kernel: [  246.975453] e1000e: eth0 NIC Link is Down
 Aug 12 11:07:10 decui-lin NetworkManager[828]: info (eth0): carrier now ON 
 (device state 100)
 Aug 12 11:07:10 decui-lin kernel: [  250.028533] e1000e: eth0 NIC Link is Up 
 100 Mbps Full Duplex, Flow Control: Rx/Tx

 It looks there is a delay of 4s.
 I'm going to find out if there is a configurable parameter for this.

Just to avoid any confusion: you are referring to networkd (and so
did my comments), but the above logs are from NetworkManager.

 disabled. Most other network drivers will send
 IFF_LOWER_DOWN/IFF_LOWER_UP upon suspend/resume so if you were to
 do the same you will not work any worse than the others. What would be
 suspend/resume(ACPI S3?) is different  as this is usually  4 seconds?  :-)

Sure, but that's clearly not something we should rely on.

 nice, as mentioned by Dan and Lennart, would be to send an additional
 explicit event such as resumed from suspend or L3 may be wrong.
 Sorry, I neglected to reply this.
 IMHO even if we add the new event, we still need lots of efforts to
 make the various userland daemons(ifplugd, networkd, etc) to use the
 new event.
 And looks we're the first user of this new event. I'm not sure if this issue
 here can convince the network subsystem maintainers such a new event
 is a must.

 That should be a generic thing though, to fix all such issues in one
 go.
 I agree, though this requires we update all the userland daemons...

  I'm not sure our attempt to fix the daemons can be easily accepted.
  BTW, by CPUID, an application has a reliable way to determine  if it's
  running on hyper-v on not. Maybe we can fix the behavior of the
  daemons when they run on hyper-v?
  BTW2, according to my limited experience, I doubt other VMMs can
  handle this auto-DHCP-renew-in-guest issue properly.

 To the extent this is a problem, it is a generic one, so we should not
 need any hyper-v specific logic in userspace.
 OK, I understood.

  That was why Yue's patch wanted to add a SLEEP(10s) between the
  link-down and link-up events and hoped this could be an acceptable
  fix(while it turned out not, obviously), though we admit it's not so good
  to add such a magic number 10s in a kernel driver.
 
  Please point it out if I missed or misunderstand something.
 
  Now I understand it's not good to pass the event to the udev daemon,
  and it's not good to use a SLEEP(10s) in the kernel space(even if it's in a
  work task here).

 Please just expose to userspace what is happening (link lost/gained,
 resumed from suspend/...), and let us sort out how to react to that.
 If you put assumptions about what kind of timeouts (long-dead)
 userspace uses into your drivers you'll just create a mess.
 OK, I got it now.
 So I think I'm supposed to send out a netif_carrier_off()/on() patch,
 and I'd better do this after I verify the daemons can work with
 proper parameters specified.

  Please let me know if it's the correct direction to fix the user-space
  daemons (ifplugd, systemd-networkd, etc).
  If you think this is viable and we should do this, I'll submit a
  netif_carrier_off/on patch first and will start to work with the
  projects of ifplugd, systemd-networkd and many OSVs to make the
  while thing work eventually.

 Have you actually checked that carrier_off/on does not work on
 anything but ifplugd? It would surprise me...
 I can confirm carrier_off/on with 0 delay between the off and on
 doesn't work for ifplugd and networkd.

 Thanks,
 -- Dexuan
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-08-11 Thread Tom Gundersen

On Mon, Aug 11, 2014 at 5:23 AM, Dexuan Cui  wrote:
>> -Original Message-
>> From: Greg KH [mailto:gre...@linuxfoundation.org]
>> > > >
>> > > > IMO the most feasible and need-the-least-change solution may be:
>> > > > the hyperv network VSC driver passes the event
>> > > > RNDIS_STATUS_NETWORK_CHANGE to the udev daemon?
>> > > >
>> > > No, don't do that, again, act like any other network device, drop the
>> > > link and bring it up when it comes back.
>> > >
>> > Hi Greg,
>> > Do you mean tearing down the net device and re-creating it (by
>> > register_netdev() and unregister_netdev)?
>>
>> No, don't you have link-detect for your network device?  Toggle that, I
>> thought patches to do this were posted a while ago...
>>
>> But if you really want to tear the whole network device down and then
>> back up again, sure, that would also work.
> Hi Greg, Stephen,
>
> Thanks for the comments!
>
> I suppose you meant the below logic:
> if (refresh) {
> rtnl_lock();
> netif_carrier_off(net);
> netif_carrier_on(net);
> rtnl_unlock();
> }
>
> We have discussed this in the previous mails of this thread itself:
> e.g., http://marc.info/?l=linux-driver-devel=140593811715975=2
>
> Unluckily this logic doesn't work because the user-space daemons
> like ifplugd, usually don't renew the DHCP immediately as long as they
> receive a link-down message: they usually wait for some seconds and if
> they find the link becomes up soon, they won't trigger renew operations.
> (I guess this behavior can be somewhat reasonable: maybe the daemons
> try to not trigger DHCP renew on temporary link instability)
>
> If we use this logic in the kernel space, we'll have to "fix" the user-space
> daemons, like ifplugd, systemd-networkd...,etc.

networkd does not suffer from this problem, and in ifplugd it can be
disabled. Most other network drivers will send
IFF_LOWER_DOWN/IFF_LOWER_UP upon suspend/resume so if you were to do
the same you will not work any worse than the others. What would be
nice, as mentioned by Dan and Lennart, would be to send an additional
explicit event such as "resumed from suspend" or "L3 may be wrong".
That should be a generic thing though, to fix all such issues in one
go.

> I'm not sure our attempt to "fix" the daemons can be easily accepted.
> BTW, by CPUID, an application has a reliable way to determine  if it's
> running on hyper-v on not. Maybe we can "fix" the behavior of the
> daemons when they run on hyper-v?
> BTW2, according to my limited experience, I doubt other VMMs can
> handle this auto-DHCP-renew-in-guest issue properly.

To the extent this is a problem, it is a generic one, so we should not
need any hyper-v specific logic in userspace.

> That was why Yue's patch wanted to add a SLEEP(10s) between the
> link-down and link-up events and hoped this could be an acceptable
> fix(while it turned out not, obviously), though we admit it's not so good
> to add such a magic number "10s" in a kernel driver.
>
> Please point it out if I missed or misunderstand something.
>
> Now I understand it's not good to pass the event to the udev daemon,
> and it's not good to use a SLEEP(10s) in the kernel space(even if it's in a
> "work" task here).

Please just expose to userspace what is happening (link lost/gained,
resumed from suspend/...), and let us sort out how to react to that.
If you put assumptions about what kind of timeouts (long-dead)
userspace uses into your drivers you'll just create a mess.

> Please let me know if it's the correct direction to fix the user-space
> daemons (ifplugd, systemd-networkd, etc).
> If you think this is viable and we should do this, I'll submit a
> netif_carrier_off/on patch first and will start to work with the
> projects of ifplugd, systemd-networkd and many OSVs to make the
> while thing work eventually.

Have you actually checked that carrier_off/on does not work on
anything but ifplugd? It would surprise me...

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-08-11 Thread Tom Gundersen

On Mon, Aug 11, 2014 at 5:23 AM, Dexuan Cui de...@microsoft.com wrote:
 -Original Message-
 From: Greg KH [mailto:gre...@linuxfoundation.org]
   
IMO the most feasible and need-the-least-change solution may be:
the hyperv network VSC driver passes the event
RNDIS_STATUS_NETWORK_CHANGE to the udev daemon?
   
   No, don't do that, again, act like any other network device, drop the
   link and bring it up when it comes back.
  
  Hi Greg,
  Do you mean tearing down the net device and re-creating it (by
  register_netdev() and unregister_netdev)?

 No, don't you have link-detect for your network device?  Toggle that, I
 thought patches to do this were posted a while ago...

 But if you really want to tear the whole network device down and then
 back up again, sure, that would also work.
 Hi Greg, Stephen,

 Thanks for the comments!

 I suppose you meant the below logic:
 if (refresh) {
 rtnl_lock();
 netif_carrier_off(net);
 netif_carrier_on(net);
 rtnl_unlock();
 }

 We have discussed this in the previous mails of this thread itself:
 e.g., http://marc.info/?l=linux-driver-develm=140593811715975w=2

 Unluckily this logic doesn't work because the user-space daemons
 like ifplugd, usually don't renew the DHCP immediately as long as they
 receive a link-down message: they usually wait for some seconds and if
 they find the link becomes up soon, they won't trigger renew operations.
 (I guess this behavior can be somewhat reasonable: maybe the daemons
 try to not trigger DHCP renew on temporary link instability)

 If we use this logic in the kernel space, we'll have to fix the user-space
 daemons, like ifplugd, systemd-networkd...,etc.

networkd does not suffer from this problem, and in ifplugd it can be
disabled. Most other network drivers will send
IFF_LOWER_DOWN/IFF_LOWER_UP upon suspend/resume so if you were to do
the same you will not work any worse than the others. What would be
nice, as mentioned by Dan and Lennart, would be to send an additional
explicit event such as resumed from suspend or L3 may be wrong.
That should be a generic thing though, to fix all such issues in one
go.

 I'm not sure our attempt to fix the daemons can be easily accepted.
 BTW, by CPUID, an application has a reliable way to determine  if it's
 running on hyper-v on not. Maybe we can fix the behavior of the
 daemons when they run on hyper-v?
 BTW2, according to my limited experience, I doubt other VMMs can
 handle this auto-DHCP-renew-in-guest issue properly.

To the extent this is a problem, it is a generic one, so we should not
need any hyper-v specific logic in userspace.

 That was why Yue's patch wanted to add a SLEEP(10s) between the
 link-down and link-up events and hoped this could be an acceptable
 fix(while it turned out not, obviously), though we admit it's not so good
 to add such a magic number 10s in a kernel driver.

 Please point it out if I missed or misunderstand something.

 Now I understand it's not good to pass the event to the udev daemon,
 and it's not good to use a SLEEP(10s) in the kernel space(even if it's in a
 work task here).

Please just expose to userspace what is happening (link lost/gained,
resumed from suspend/...), and let us sort out how to react to that.
If you put assumptions about what kind of timeouts (long-dead)
userspace uses into your drivers you'll just create a mess.

 Please let me know if it's the correct direction to fix the user-space
 daemons (ifplugd, systemd-networkd, etc).
 If you think this is viable and we should do this, I'll submit a
 netif_carrier_off/on patch first and will start to work with the
 projects of ifplugd, systemd-networkd and many OSVs to make the
 while thing work eventually.

Have you actually checked that carrier_off/on does not work on
anything but ifplugd? It would surprise me...

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7 08/33] net: nl80211 - make rdev_add_virtual_intf take name_assign_type

2014-07-21 Thread Tom Gundersen

Hi Johannes,

On Mon, Jul 21, 2014 at 11:40 AM, Johannes Berg
 wrote:
> On Thu, 2014-07-10 at 10:17 +0200, Tom Gundersen wrote:
>> Pass the value down and set it at the same place the name itself is set.
>
> Is this going to be applied as part of one bigger series?

This patch is superceded by "[PATCH net-next v9 2/9] net: nl80211 -
make rdev_add_virtual_intf take name_assign_type", which indeed is
part of a bigger series.

However, if you and David prefer to take it through your tree, let me
know and I'll split it out from the series and resubmit it to you
independently.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-07-21 Thread Tom Gundersen

On Mon, Jul 21, 2014 at 12:21 PM, Yue Zhang (OSTC DEV)
 wrote:
>> From: Tom Gundersen [mailto:t...@jklm.no]
>> Sent: Monday, July 21, 2014 5:42 PM
>>
>> On Fri, Jul 18, 2014 at 12:55 PM, Yue Zhang  wrote:
>> > From: Yue Zhang 
>> >
>> > This patch addresses the comment from Olaf Hering and Greg KH
>> > for a previous commit 3a494e710367 ("hyperv: Add handler for
>> > RNDIS_STATUS_NETWORK_CHANGE event")
>> >
>> > In previous solution, the driver calls "network restart" to
>> > force a DHCP renew when the host is back from hibernation.
>> >
>> > In this fix, the driver will keep network carrier offline for
>> > 10 seconds and then bring it back. So that ifplugd daemon will
>> > notice this change and refresh DHCP lease.
>> >
>> > Cc: Haiyang Zhang 
>> > Cc: K. Y. Srinivasan 
>> >
>> > Signed-off-by: Yue Zhang 
>> > ---
>> >  drivers/net/hyperv/netvsc_drv.c | 21 +
>> >  1 file changed, 17 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/drivers/net/hyperv/netvsc_drv.c
>> b/drivers/net/hyperv/netvsc_drv.c
>> > index a9c5eaa..559c97d 100644
>> > --- a/drivers/net/hyperv/netvsc_drv.c
>> > +++ b/drivers/net/hyperv/netvsc_drv.c
>> > @@ -33,6 +33,7 @@
>> >  #include 
>> >  #include 
>> >  #include 
>> > +#include 
>> >  #include 
>> >  #include 
>> >  #include 
>> > @@ -792,8 +793,7 @@ static void netvsc_link_change(struct work_struct
>> *w)
>> > struct netvsc_device *net_device;
>> > struct rndis_device *rdev;
>> > bool notify, refresh = false;
>> > -   char *argv[] = { "/etc/init.d/network", "restart", NULL };
>> > -   char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
>> NULL };
>> > +   int delay;
>> >
>> > rtnl_lock();
>> >
>> > @@ -816,8 +816,21 @@ static void netvsc_link_change(struct work_struct
>> *w)
>> >
>> > rtnl_unlock();
>> >
>> > -   if (refresh)
>> > -   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
>> > +   if (refresh) {
>> > +   /*
>> > +* Keep the carrier offline for 10 seconds
>> > +* to notify ifplugd daemon network change
>> > +*/
>> > +   for (delay = 0; delay < 10; delay++) {
>> > +   rtnl_lock();
>> > +   netif_carrier_off(net);
>> > +   rtnl_unlock();
>> > +   ssleep(1);
>> > +   }
>> > +   rtnl_lock();
>> > +   netif_carrier_on(net);
>> > +   rtnl_unlock();
>> > +   }
>>
>> Why is it necessary to wait for ten seconds? Why not just:
>>
>> if (refresh) {
>> rtnl_lock();
>> netif_carrier_off(net);
>> netif_carrier_on(net);
>> rtnl_unlock();
>> }
>>
>> At least systemd-networkd will renew the dhcp lease as long as it gets
>> NEWLINK messages indicating that the carrier was lost and regained,
>> regardless of the time between events. Is there any reason not to do
>> this?
>>
>> Cheers,
>>
>> Tom
>>
>
> Hi, Tom
>
> Some network monitoring daemon, like ifplugd has a deferring mechanism.
> When it detects carriers is offline, it doesn't trigger DHCP renew 
> immediately.
> Instead it will wait for another 5 seconds to check whether carrier is back to
> online status. In that case, it will avoid renew DHCP lease.
>
> And also there is some optimization in Linux's network stack. If link state
> flipped so quickly, like the code you proposed. It is very likely the event 
> won't
> be delivered to user space.

Ah, ok, I don't know the kernel side of this too well, you may need
some sort of flush or sync between the calls I suggested. At any rate,
I would say that the solution should be to send a "lower down"
followed immediately by "lower up" and never have any sort of timeout.
All the drivers I have tried send out such events immediately when the
machine is resumed, so I guess most network software should know how
to deal with that.

I really think the policy of what to do in response to the various
events should be left to userspace to figure out.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-07-21 Thread Tom Gundersen

On Fri, Jul 18, 2014 at 12:55 PM, Yue Zhang  wrote:
> From: Yue Zhang 
>
> This patch addresses the comment from Olaf Hering and Greg KH
> for a previous commit 3a494e710367 ("hyperv: Add handler for
> RNDIS_STATUS_NETWORK_CHANGE event")
>
> In previous solution, the driver calls "network restart" to
> force a DHCP renew when the host is back from hibernation.
>
> In this fix, the driver will keep network carrier offline for
> 10 seconds and then bring it back. So that ifplugd daemon will
> notice this change and refresh DHCP lease.
>
> Cc: Haiyang Zhang 
> Cc: K. Y. Srinivasan 
>
> Signed-off-by: Yue Zhang 
> ---
>  drivers/net/hyperv/netvsc_drv.c | 21 +
>  1 file changed, 17 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index a9c5eaa..559c97d 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -33,6 +33,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -792,8 +793,7 @@ static void netvsc_link_change(struct work_struct *w)
> struct netvsc_device *net_device;
> struct rndis_device *rdev;
> bool notify, refresh = false;
> -   char *argv[] = { "/etc/init.d/network", "restart", NULL };
> -   char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL 
> };
> +   int delay;
>
> rtnl_lock();
>
> @@ -816,8 +816,21 @@ static void netvsc_link_change(struct work_struct *w)
>
> rtnl_unlock();
>
> -   if (refresh)
> -   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> +   if (refresh) {
> +   /*
> +* Keep the carrier offline for 10 seconds
> +* to notify ifplugd daemon network change
> +*/
> +   for (delay = 0; delay < 10; delay++) {
> +   rtnl_lock();
> +   netif_carrier_off(net);
> +   rtnl_unlock();
> +   ssleep(1);
> +   }
> +   rtnl_lock();
> +   netif_carrier_on(net);
> +   rtnl_unlock();
> +   }

Why is it necessary to wait for ten seconds? Why not just:

if (refresh) {
rtnl_lock();
netif_carrier_off(net);
netif_carrier_on(net);
rtnl_unlock();
}

At least systemd-networkd will renew the dhcp lease as long as it gets
NEWLINK messages indicating that the carrier was lost and regained,
regardless of the time between events. Is there any reason not to do
this?

Cheers,

Tom

> if (notify)
> netdev_notify_peers(net);
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-07-21 Thread Tom Gundersen

On Fri, Jul 18, 2014 at 12:55 PM, Yue Zhang yue...@microsoft.com wrote:
 From: Yue Zhang yue...@microsoft.com

 This patch addresses the comment from Olaf Hering and Greg KH
 for a previous commit 3a494e710367 (hyperv: Add handler for
 RNDIS_STATUS_NETWORK_CHANGE event)

 In previous solution, the driver calls network restart to
 force a DHCP renew when the host is back from hibernation.

 In this fix, the driver will keep network carrier offline for
 10 seconds and then bring it back. So that ifplugd daemon will
 notice this change and refresh DHCP lease.

 Cc: Haiyang Zhang haiya...@microsoft.com
 Cc: K. Y. Srinivasan k...@microsoft.com

 Signed-off-by: Yue Zhang yue...@microsoft.com
 ---
  drivers/net/hyperv/netvsc_drv.c | 21 +
  1 file changed, 17 insertions(+), 4 deletions(-)

 diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
 index a9c5eaa..559c97d 100644
 --- a/drivers/net/hyperv/netvsc_drv.c
 +++ b/drivers/net/hyperv/netvsc_drv.c
 @@ -33,6 +33,7 @@
  #include linux/if_vlan.h
  #include linux/in.h
  #include linux/slab.h
 +#include linux/delay.h
  #include net/arp.h
  #include net/route.h
  #include net/sock.h
 @@ -792,8 +793,7 @@ static void netvsc_link_change(struct work_struct *w)
 struct netvsc_device *net_device;
 struct rndis_device *rdev;
 bool notify, refresh = false;
 -   char *argv[] = { /etc/init.d/network, restart, NULL };
 -   char *envp[] = { HOME=/, PATH=/sbin:/usr/sbin:/bin:/usr/bin, NULL 
 };
 +   int delay;

 rtnl_lock();

 @@ -816,8 +816,21 @@ static void netvsc_link_change(struct work_struct *w)

 rtnl_unlock();

 -   if (refresh)
 -   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 +   if (refresh) {
 +   /*
 +* Keep the carrier offline for 10 seconds
 +* to notify ifplugd daemon network change
 +*/
 +   for (delay = 0; delay  10; delay++) {
 +   rtnl_lock();
 +   netif_carrier_off(net);
 +   rtnl_unlock();
 +   ssleep(1);
 +   }
 +   rtnl_lock();
 +   netif_carrier_on(net);
 +   rtnl_unlock();
 +   }

Why is it necessary to wait for ten seconds? Why not just:

if (refresh) {
rtnl_lock();
netif_carrier_off(net);
netif_carrier_on(net);
rtnl_unlock();
}

At least systemd-networkd will renew the dhcp lease as long as it gets
NEWLINK messages indicating that the carrier was lost and regained,
regardless of the time between events. Is there any reason not to do
this?

Cheers,

Tom

 if (notify)
 netdev_notify_peers(net);
 --
 1.9.1

 --
 To unsubscribe from this list: send the line unsubscribe netdev in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Hyperv: Trigger DHCP renew after host hibernation

2014-07-21 Thread Tom Gundersen

On Mon, Jul 21, 2014 at 12:21 PM, Yue Zhang (OSTC DEV)
yue...@microsoft.com wrote:
 From: Tom Gundersen [mailto:t...@jklm.no]
 Sent: Monday, July 21, 2014 5:42 PM

 On Fri, Jul 18, 2014 at 12:55 PM, Yue Zhang yue...@microsoft.com wrote:
  From: Yue Zhang yue...@microsoft.com

  This patch addresses the comment from Olaf Hering and Greg KH
  for a previous commit 3a494e710367 (hyperv: Add handler for
  RNDIS_STATUS_NETWORK_CHANGE event)

  In previous solution, the driver calls network restart to
  force a DHCP renew when the host is back from hibernation.

  In this fix, the driver will keep network carrier offline for
  10 seconds and then bring it back. So that ifplugd daemon will
  notice this change and refresh DHCP lease.

  Cc: Haiyang Zhang haiya...@microsoft.com
  Cc: K. Y. Srinivasan k...@microsoft.com

  Signed-off-by: Yue Zhang yue...@microsoft.com
  ---
   drivers/net/hyperv/netvsc_drv.c | 21 +
   1 file changed, 17 insertions(+), 4 deletions(-)

  diff --git a/drivers/net/hyperv/netvsc_drv.c
 b/drivers/net/hyperv/netvsc_drv.c
  index a9c5eaa..559c97d 100644
  --- a/drivers/net/hyperv/netvsc_drv.c
  +++ b/drivers/net/hyperv/netvsc_drv.c
  @@ -33,6 +33,7 @@
   #include linux/if_vlan.h
   #include linux/in.h
   #include linux/slab.h
  +#include linux/delay.h
   #include net/arp.h
   #include net/route.h
   #include net/sock.h
  @@ -792,8 +793,7 @@ static void netvsc_link_change(struct work_struct
 *w)
  struct netvsc_device *net_device;
  struct rndis_device *rdev;
  bool notify, refresh = false;
  -   char *argv[] = { /etc/init.d/network, restart, NULL };
  -   char *envp[] = { HOME=/, PATH=/sbin:/usr/sbin:/bin:/usr/bin,
 NULL };
  +   int delay;

  rtnl_lock();

  @@ -816,8 +816,21 @@ static void netvsc_link_change(struct work_struct
 *w)

  rtnl_unlock();

  -   if (refresh)
  -   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
  +   if (refresh) {
  +   /*
  +* Keep the carrier offline for 10 seconds
  +* to notify ifplugd daemon network change
  +*/
  +   for (delay = 0; delay  10; delay++) {
  +   rtnl_lock();
  +   netif_carrier_off(net);
  +   rtnl_unlock();
  +   ssleep(1);
  +   }
  +   rtnl_lock();
  +   netif_carrier_on(net);
  +   rtnl_unlock();
  +   }

 Why is it necessary to wait for ten seconds? Why not just:

 if (refresh) {
 rtnl_lock();
 netif_carrier_off(net);
 netif_carrier_on(net);
 rtnl_unlock();
 }

 At least systemd-networkd will renew the dhcp lease as long as it gets
 NEWLINK messages indicating that the carrier was lost and regained,
 regardless of the time between events. Is there any reason not to do
 this?

 Cheers,

 Tom

 Hi, Tom

 Some network monitoring daemon, like ifplugd has a deferring mechanism.
 When it detects carriers is offline, it doesn't trigger DHCP renew 
 immediately.
 Instead it will wait for another 5 seconds to check whether carrier is back to
 online status. In that case, it will avoid renew DHCP lease.

 And also there is some optimization in Linux's network stack. If link state
 flipped so quickly, like the code you proposed. It is very likely the event 
 won't
 be delivered to user space.

Ah, ok, I don't know the kernel side of this too well, you may need
some sort of flush or sync between the calls I suggested. At any rate,
I would say that the solution should be to send a lower down
followed immediately by lower up and never have any sort of timeout.
All the drivers I have tried send out such events immediately when the
machine is resumed, so I guess most network software should know how
to deal with that.

I really think the policy of what to do in response to the various
events should be left to userspace to figure out.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7 08/33] net: nl80211 - make rdev_add_virtual_intf take name_assign_type

2014-07-21 Thread Tom Gundersen

Hi Johannes,

On Mon, Jul 21, 2014 at 11:40 AM, Johannes Berg
johan...@sipsolutions.net wrote:
 On Thu, 2014-07-10 at 10:17 +0200, Tom Gundersen wrote:
 Pass the value down and set it at the same place the name itself is set.

 Is this going to be applied as part of one bigger series?

This patch is superceded by [PATCH net-next v9 2/9] net: nl80211 -
make rdev_add_virtual_intf take name_assign_type, which indeed is
part of a bigger series.

However, if you and David prefer to take it through your tree, let me
know and I'll split it out from the series and resubmit it to you
independently.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next v9 4/9] net: set name_assign_type when setting names via ioctls

2014-07-18 Thread Tom Gundersen

On Fri, Jul 18, 2014 at 1:20 AM, David Miller  wrote:
> From: Tom Gundersen 
> Date: Thu, 17 Jul 2014 10:06:05 +0200
>
>> @@ -2787,10 +2788,13 @@ static int gsm_create_network(struct gsm_dlci *dlci, 
>> struct gsm_netconfig *nc)
>>   pr_debug("create network interface");
>>
>>   netname = "gsm%d";
>> - if (nc->if_name[0] != '\0')
>> + if (nc->if_name[0] != '\0') {
>>   netname = nc->if_name;
>> - net = alloc_netdev(sizeof(struct gsm_mux_net), netname,
>> -NET_NAME_UNKNOWN, gsm_mux_net_init);
>> + name_assign_type = NET_NAME_USER;
>> + }
>> + net = alloc_netdev(sizeof(struct gsm_mux_net),
>> + netname, name_assign_type,
>> + gsm_mux_net_init);
>
> The indentation is now not correct.  For a function call, all arguments on the
> second and subsequent line, must start exactly at the first column after the
> openning parenthesis of the function call.

Indeed. Thanks, I'll fix this up.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next v9 3/9] net: nl802154 - make add_iface take name assign type

2014-07-18 Thread Tom Gundersen

On Fri, Jul 18, 2014 at 1:19 AM, David Miller  wrote:
> From: Tom Gundersen 
> Date: Thu, 17 Jul 2014 10:06:04 +0200
>
>> @@ -192,8 +193,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct 
>> genl_info *info)
>>   if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
>>   != '\0')
>>   return -EINVAL; /* phy name should be null-terminated 
>> */
>> + name_assign_type = NET_NAME_USER;
>>   } else  {
>>   devname = "wpan%d";
>> + name_assign_type = NET_NAME_ENUM;
>>   }
>>
>>   if (strlen(devname) >= IFNAMSIZ)
>
> Just wondering what should happen if "%d" appears in a user provided name.
>
> That would seem to be both USER and ENUM.

Yes, this is a bit of a grey area.

I discussed this a bit with David Herrmann, and we landed on that
these names should be USER. As the user has explicitly asked for the
enumerated name, nobody should rename them to anything else (so ENUM
would have the wrong effect as it indicates that userspace should
rename the device), moreover, we should assume that the user knows
what they are doing, and that the enumerated names are fine in this
context.

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next v9 3/9] net: nl802154 - make add_iface take name assign type

2014-07-18 Thread Tom Gundersen

On Fri, Jul 18, 2014 at 1:19 AM, David Miller da...@davemloft.net wrote:
 From: Tom Gundersen t...@jklm.no
 Date: Thu, 17 Jul 2014 10:06:04 +0200

 @@ -192,8 +193,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct 
 genl_info *info)
   if (devname[nla_len(info-attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
   != '\0')
   return -EINVAL; /* phy name should be null-terminated 
 */
 + name_assign_type = NET_NAME_USER;
   } else  {
   devname = wpan%d;
 + name_assign_type = NET_NAME_ENUM;
   }

   if (strlen(devname) = IFNAMSIZ)

 Just wondering what should happen if %d appears in a user provided name.

 That would seem to be both USER and ENUM.

Yes, this is a bit of a grey area.

I discussed this a bit with David Herrmann, and we landed on that
these names should be USER. As the user has explicitly asked for the
enumerated name, nobody should rename them to anything else (so ENUM
would have the wrong effect as it indicates that userspace should
rename the device), moreover, we should assume that the user knows
what they are doing, and that the enumerated names are fine in this
context.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next v9 4/9] net: set name_assign_type when setting names via ioctls

2014-07-18 Thread Tom Gundersen

On Fri, Jul 18, 2014 at 1:20 AM, David Miller da...@davemloft.net wrote:
 From: Tom Gundersen t...@jklm.no
 Date: Thu, 17 Jul 2014 10:06:05 +0200

 @@ -2787,10 +2788,13 @@ static int gsm_create_network(struct gsm_dlci *dlci, 
 struct gsm_netconfig *nc)
   pr_debug(create network interface);

   netname = gsm%d;
 - if (nc-if_name[0] != '\0')
 + if (nc-if_name[0] != '\0') {
   netname = nc-if_name;
 - net = alloc_netdev(sizeof(struct gsm_mux_net), netname,
 -NET_NAME_UNKNOWN, gsm_mux_net_init);
 + name_assign_type = NET_NAME_USER;
 + }
 + net = alloc_netdev(sizeof(struct gsm_mux_net),
 + netname, name_assign_type,
 + gsm_mux_net_init);

 The indentation is now not correct.  For a function call, all arguments on the
 second and subsequent line, must start exactly at the first column after the
 openning parenthesis of the function call.

Indeed. Thanks, I'll fix this up.

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next v9 2/9] net: nl80211 - make rdev_add_virtual_intf take name_assign_type

2014-07-17 Thread Tom Gundersen

On Thu, Jul 17, 2014 at 11:04 AM, Johannes Berg
 wrote:
> On Thu, 2014-07-17 at 10:06 +0200, Tom Gundersen wrote:
>> Pass the value down and set it at the same place the name itself is set.
>
> Huh? What's this? Can you explain why?

Sure, sorry the context got lost.

We want to expose the origin of ifnames so that udev can better decide
whether or not to rename them. In particular, this patch will allow
udev to tell that it should no longer rename Wifi-P2P devices, which
is currently a problem.

A general overview is in the patch introducing the interface:
<http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=685343fc3ba61a1f6eef361b786601123db16c28>.

>>  static struct wireless_dev *ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
>> const char *name,
>> +   unsigned char 
>> name_assign_type,
>
> unsigned char for an enum is really odd.

Hm, this was based on the addr_assign_type attribute, which also is an
unsigned char (and serves a very similar purpose).

Out of interest, what would you have preferred (and why)?

Cheers,

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 3/9] net: nl802154 - make add_iface take name assign type

2014-07-17 Thread Tom Gundersen

Signed-off-by: Tom Gundersen 
Acked-by: Alexander Aring 
Cc: Dmitry Eremin-Solenikov 
Cc: linux-zigbee-de...@lists.sourceforge.net
---
 include/net/wpan-phy.h | 4 +++-
 net/ieee802154/nl-phy.c| 5 -
 net/mac802154/ieee802154_dev.c | 7 ---
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/net/wpan-phy.h b/include/net/wpan-phy.h
index 10ab0fc..bb07a1b 100644
--- a/include/net/wpan-phy.h
+++ b/include/net/wpan-phy.h
@@ -58,7 +58,9 @@ struct wpan_phy {
int idx;
 
struct net_device *(*add_iface)(struct wpan_phy *phy,
-   const char *name, int type);
+   const char *name,
+   unsigned char name_assign_type,
+   int type);
void (*del_iface)(struct wpan_phy *phy, struct net_device *dev);
 
int (*set_txpower)(struct wpan_phy *phy, int db);
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 972baf8..5e1a28e 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -174,6 +174,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct 
genl_info *info)
struct wpan_phy *phy;
const char *name;
const char *devname;
+   unsigned char name_assign_type;
int rc = -ENOBUFS;
struct net_device *dev;
int type = __IEEE802154_DEV_INVALID;
@@ -192,8 +193,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct 
genl_info *info)
if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
!= '\0')
return -EINVAL; /* phy name should be null-terminated */
+   name_assign_type = NET_NAME_USER;
} else  {
devname = "wpan%d";
+   name_assign_type = NET_NAME_ENUM;
}
 
if (strlen(devname) >= IFNAMSIZ)
@@ -227,7 +230,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct 
genl_info *info)
}
}
 
-   dev = phy->add_iface(phy, devname, type);
+   dev = phy->add_iface(phy, devname, name_assign_type, type);
if (IS_ERR(dev)) {
rc = PTR_ERR(dev);
goto nla_put_failure;
diff --git a/net/mac802154/ieee802154_dev.c b/net/mac802154/ieee802154_dev.c
index b36b2b9..cee12b2 100644
--- a/net/mac802154/ieee802154_dev.c
+++ b/net/mac802154/ieee802154_dev.c
@@ -159,7 +159,8 @@ mac802154_del_iface(struct wpan_phy *phy, struct net_device 
*dev)
 }
 
 static struct net_device *
-mac802154_add_iface(struct wpan_phy *phy, const char *name, int type)
+mac802154_add_iface(struct wpan_phy *phy, const char *name,
+   unsigned char name_assign_type, int type)
 {
struct net_device *dev;
int err = -ENOMEM;
@@ -167,12 +168,12 @@ mac802154_add_iface(struct wpan_phy *phy, const char 
*name, int type)
switch (type) {
case IEEE802154_DEV_MONITOR:
dev = alloc_netdev(sizeof(struct mac802154_sub_if_data),
-  name, NET_NAME_UNKNOWN,
+  name, name_assign_type,
   mac802154_monitor_setup);
break;
case IEEE802154_DEV_WPAN:
dev = alloc_netdev(sizeof(struct mac802154_sub_if_data),
-  name, NET_NAME_UNKNOWN,
+  name, name_assign_type,
   mac802154_wpan_setup);
break;
default:
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 4/9] net: set name_assign_type when setting names via ioctls

2014-07-17 Thread Tom Gundersen

When the user provides a name, set NET_NAME_USER, otherwise an enumerated
name is used, so set NET_NAME_ENUM.

Signed-off-by: Tom Gundersen 
---
 drivers/net/tun.c |  9 ++---
 drivers/tty/n_gsm.c   | 10 +++---
 net/atm/br2684.c  |  5 +++--
 net/bluetooth/bnep/core.c |  2 +-
 net/bridge/br_if.c|  7 ---
 net/bridge/br_ioctl.c |  4 ++--
 net/bridge/br_private.h   |  2 +-
 7 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index acaaf67..eac34f5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1607,6 +1607,7 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
}
else {
char *name;
+   unsigned char name_assign_type = NET_NAME_ENUM;
unsigned long flags = 0;
int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
 MAX_TAP_QUEUES : 1;
@@ -1629,12 +1630,14 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
} else
return -EINVAL;
 
-   if (*ifr->ifr_name)
+   if (*ifr->ifr_name) {
name = ifr->ifr_name;
+   name_assign_type = NET_NAME_USER;
+   }
 
dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
-  NET_NAME_UNKNOWN, tun_setup, queues,
-  queues);
+  name_assign_type, tun_setup,
+  queues, queues);
 
if (!dev)
return -ENOMEM;
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index cde3ab9..d0262d9 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2767,6 +2767,7 @@ static void gsm_destroy_network(struct gsm_dlci *dlci)
 static int gsm_create_network(struct gsm_dlci *dlci, struct gsm_netconfig *nc)
 {
char *netname;
+   unsigned char name_assign_type = NET_NAME_ENUM;
int retval = 0;
struct net_device *net;
struct gsm_mux_net *mux_net;
@@ -2787,10 +2788,13 @@ static int gsm_create_network(struct gsm_dlci *dlci, 
struct gsm_netconfig *nc)
pr_debug("create network interface");
 
netname = "gsm%d";
-   if (nc->if_name[0] != '\0')
+   if (nc->if_name[0] != '\0') {
netname = nc->if_name;
-   net = alloc_netdev(sizeof(struct gsm_mux_net), netname,
-  NET_NAME_UNKNOWN, gsm_mux_net_init);
+   name_assign_type = NET_NAME_USER;
+   }
+   net = alloc_netdev(sizeof(struct gsm_mux_net),
+   netname, name_assign_type,
+   gsm_mux_net_init);
if (!net) {
pr_err("alloc_netdev failed");
return -ENOMEM;
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index cc78538..cb8d122 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -682,8 +682,9 @@ static int br2684_create(void __user *arg)
 
netdev = alloc_netdev(sizeof(struct br2684_dev),
  ni.ifname[0] ? ni.ifname : "nas%d",
- NET_NAME_UNKNOWN,
- (payload == p_routed) ? br2684_setup_routed : 
br2684_setup);
+ ni.ifname[0] ? NET_NAME_USER : NET_NAME_ENUM,
+ (payload == p_routed) ?
+ br2684_setup_routed : br2684_setup);
if (!netdev)
return -ENOMEM;
 
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 85bcc21..6bed7b3 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -539,7 +539,7 @@ int bnep_add_connection(struct bnep_connadd_req *req, 
struct socket *sock)
/* session struct allocated as private part of net_device */
dev = alloc_netdev(sizeof(struct bnep_session),
   (*req->device) ? req->device : "bnep%d",
-  NET_NAME_UNKNOWN,
+  (*req->device) ? NET_NAME_USER : NET_NAME_ENUM,
   bnep_net_setup);
if (!dev)
return -ENOMEM;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 078d336..f9c6766 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -339,13 +339,14 @@ static struct net_bridge_port *new_nbp(struct net_bridge 
*br,
return p;
 }
 
-int br_add_bridge(struct net *net, const char *name)
+int br_add_bridge(struct net *net, const char *name,
+ unsigned char name_assign_type)
 {
struct net_device *dev;
int res;
 
-   dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
-  br_dev_setup);
+   dev = alloc_netdev(sizeof

[PATCH net-next v9 5/9] net: bond - make bond_create take name_assign_type

2014-07-17 Thread Tom Gundersen

Signed-off-by: Tom Gundersen 
Cc: Jay Vosburgh 
Cc: Veaceslav Falico 
Cc: Andy Gospodarek 
---
 drivers/net/bonding/bond_main.c  | 7 ---
 drivers/net/bonding/bond_sysfs.c | 2 +-
 drivers/net/bonding/bonding.h| 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index d643807..1769745 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4421,7 +4421,7 @@ unsigned int bond_get_num_tx_queues(void)
  * Caller must NOT hold rtnl_lock; we need to release it here before we
  * set up our sysfs entries.
  */
-int bond_create(struct net *net, const char *name)
+int bond_create(struct net *net, const char *name, unsigned char 
name_assign_type)
 {
struct net_device *bond_dev;
int res;
@@ -4429,7 +4429,8 @@ int bond_create(struct net *net, const char *name)
rtnl_lock();
 
bond_dev = alloc_netdev_mq(sizeof(struct bonding),
-  name ? name : "bond%d", NET_NAME_UNKNOWN,
+  name ? name : "bond%d",
+  name ? name_assign_type : NET_NAME_ENUM,
   bond_setup, tx_queues);
if (!bond_dev) {
pr_err("%s: eek! can't alloc netdev!\n", name);
@@ -4509,7 +4510,7 @@ static int __init bonding_init(void)
bond_create_debugfs();
 
for (i = 0; i < max_bonds; i++) {
-   res = bond_create(_net, NULL);
+   res = bond_create(_net, NULL, NET_NAME_UNKNOWN);
if (res)
goto err;
}
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 98db8ed..7ac498c 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -111,7 +111,7 @@ static ssize_t bonding_store_bonds(struct class *cls,
 
if (command[0] == '+') {
pr_info("%s is being created...\n", ifname);
-   rv = bond_create(bn->net, ifname);
+   rv = bond_create(bn->net, ifname, NET_NAME_USER);
if (rv) {
if (rv == -EEXIST)
pr_info("%s already exists\n", ifname);
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index b2e548e..95b5e70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -496,7 +496,7 @@ struct bond_net {
 
 int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave 
*slave);
 void bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct 
net_device *slave_dev);
-int bond_create(struct net *net, const char *name);
+int bond_create(struct net *net, const char *name, unsigned char 
name_assign_type);
 int bond_create_sysfs(struct bond_net *net);
 void bond_destroy_sysfs(struct bond_net *net);
 void bond_prepare_sysfs_group(struct bonding *bond);
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 9/9] net: ppp - set name assign type

2014-07-17 Thread Tom Gundersen

The ifname is of the form pppX where X is the unit number. This is either set
by userspace, or userspace requests the kernel to chose one, which is then
returned to userspace. Either way the creating user knows the name, so we
treat both cases as if the user had explicitly chosen the name and label
it NET_NAME_USER.

Signed-off-by: Tom Gundersen 
Cc: Paul Mackerras 
Cc: linux-...@vger.kernel.org
---
 drivers/net/ppp/ppp_generic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index c38ee90..6526057 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -2728,6 +2728,7 @@ ppp_create_interface(struct net *net, int unit, int *retp)
/* Initialize the new ppp unit */
ppp->file.index = unit;
sprintf(dev->name, "ppp%d", unit);
+   dev->name_assign_type = NET_NAME_USER;
 
ret = register_netdev(dev);
if (ret != 0) {
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 6/9] net: isdn - set name assign type

2014-07-17 Thread Tom Gundersen

Signed-off-by: Tom Gundersen 
Cc: Karsten Keil 
---
 drivers/isdn/i4l/isdn_common.c |  4 ++--
 drivers/isdn/i4l/isdn_net.c| 10 +-
 drivers/isdn/i4l/isdn_net.h|  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 9b856e1..67840ca 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -1364,7 +1364,7 @@ isdn_ioctl(struct file *file, uint cmd, ulong arg)
}
ret = mutex_lock_interruptible(>mtx);
if (ret) return ret;
-   if ((s = isdn_net_new(s, NULL))) {
+   if ((s = isdn_net_new(s, NET_NAME_USER, NULL))) {
if (copy_to_user(argp, s, strlen(s) + 1)) {
ret = -EFAULT;
} else {
@@ -1383,7 +1383,7 @@ isdn_ioctl(struct file *file, uint cmd, ulong arg)
return -EINVAL;
ret = mutex_lock_interruptible(>mtx);
if (ret) return ret;
-   if ((s = isdn_net_newslave(bname))) {
+   if ((s = isdn_net_newslave(bname, NET_NAME_USER))) {
if (copy_to_user(argp, s, strlen(s) + 1)) {
ret = -EFAULT;
} else {
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index c2ed624..d16fec0 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -2573,7 +2573,7 @@ static void _isdn_setup(struct net_device *dev)
  * Allocate a new network-interface and initialize its data structures.
  */
 char *
-isdn_net_new(char *name, struct net_device *master)
+isdn_net_new(char *name, unsigned char name_assign_type, struct net_device 
*master)
 {
isdn_net_dev *netdev;
 
@@ -2588,8 +2588,8 @@ isdn_net_new(char *name, struct net_device *master)
printk(KERN_WARNING "isdn_net: Could not allocate 
net-device\n");
return NULL;
}
-   netdev->dev = alloc_netdev(sizeof(isdn_net_local), name,
-  NET_NAME_UNKNOWN, _isdn_setup);
+   netdev->dev = alloc_netdev(sizeof(isdn_net_local), name, 
name_assign_type,
+  _isdn_setup);
if (!netdev->dev) {
printk(KERN_WARNING "isdn_net: Could not allocate network 
device\n");
kfree(netdev);
@@ -2637,7 +2637,7 @@ isdn_net_new(char *name, struct net_device *master)
 }
 
 char *
-isdn_net_newslave(char *parm)
+isdn_net_newslave(char *parm, unsigned char name_assign_type)
 {
char *p = strchr(parm, ',');
isdn_net_dev *n;
@@ -2658,7 +2658,7 @@ isdn_net_newslave(char *parm)
/* Master must not be started yet */
if (isdn_net_device_started(n))
return NULL;
-   return (isdn_net_new(newname, n->dev));
+   return (isdn_net_new(newname, name_assign_type, n->dev));
}
return NULL;
 }
diff --git a/drivers/isdn/i4l/isdn_net.h b/drivers/isdn/i4l/isdn_net.h
index cca6d68..99f0c47 100644
--- a/drivers/isdn/i4l/isdn_net.h
+++ b/drivers/isdn/i4l/isdn_net.h
@@ -31,8 +31,8 @@
 #define CISCO_SLARP_REPLY 1
 #define CISCO_SLARP_KEEPALIVE 2
 
-extern char *isdn_net_new(char *, struct net_device *);
-extern char *isdn_net_newslave(char *);
+extern char *isdn_net_new(char *, unsigned char, struct net_device *);
+extern char *isdn_net_newslave(char *, unsigned char);
 extern int isdn_net_rm(char *);
 extern int isdn_net_rmall(void);
 extern int isdn_net_stat_callback(int, isdn_ctrl *);
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 7/9] net: vlan - set name assign type

2014-07-17 Thread Tom Gundersen

When deriving the name from the real device, inherit the assign type, otherwise
set PREDICTABLE as the name will be uniquely determined by the VLANID.

Signed-off-by: Tom Gundersen 
Cc: Patrick McHardy 
---
 net/8021q/vlan.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index cba9c21..cf88f7b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -212,6 +212,7 @@ static int register_vlan_device(struct net_device 
*real_dev, u16 vlan_id)
 {
struct net_device *new_dev;
struct vlan_dev_priv *vlan;
+   unsigned char name_assign_type;
struct net *net = dev_net(real_dev);
struct vlan_net *vn = net_generic(net, vlan_net_id);
char name[IFNAMSIZ];
@@ -229,18 +230,21 @@ static int register_vlan_device(struct net_device 
*real_dev, u16 vlan_id)
case VLAN_NAME_TYPE_RAW_PLUS_VID:
/* name will look like:  eth1.0005 */
snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id);
+   name_assign_type = real_dev->name_assign_type;
break;
case VLAN_NAME_TYPE_PLUS_VID_NO_PAD:
/* Put our vlan.VID in the name.
 * Name will look like:  vlan5
 */
snprintf(name, IFNAMSIZ, "vlan%i", vlan_id);
+   name_assign_type = NET_NAME_PREDICTABLE;
break;
case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD:
/* Put our vlan.VID in the name.
 * Name will look like:  eth0.5
 */
snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id);
+   name_assign_type = real_dev->name_assign_type;
break;
case VLAN_NAME_TYPE_PLUS_VID:
/* Put our vlan.VID in the name.
@@ -248,10 +252,11 @@ static int register_vlan_device(struct net_device 
*real_dev, u16 vlan_id)
 */
default:
snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id);
+   name_assign_type = NET_NAME_PREDICTABLE;
}
 
new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name,
-  NET_NAME_UNKNOWN, vlan_setup);
+  name_assign_type, vlan_setup);
 
if (new_dev == NULL)
return -ENOBUFS;
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 2/9] net: nl80211 - make rdev_add_virtual_intf take name_assign_type

2014-07-17 Thread Tom Gundersen

Pass the value down and set it at the same place the name itself is set.

Signed-off-by: Tom Gundersen 
Cc: Johannes Berg 
Cc: John Linville 
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  6 --
 drivers/net/wireless/ath/ath6kl/cfg80211.h|  1 +
 drivers/net/wireless/ath/ath6kl/core.c|  4 ++--
 drivers/net/wireless/brcm80211/brcmfmac/dhd.h |  3 ++-
 drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c   | 10 ++
 drivers/net/wireless/brcm80211/brcmfmac/fweh.c|  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c |  3 +++
 drivers/net/wireless/brcm80211/brcmfmac/p2p.h |  1 +
 drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c |  4 +++-
 drivers/net/wireless/mwifiex/cfg80211.c   |  5 +++--
 drivers/net/wireless/mwifiex/main.c   |  2 +-
 drivers/net/wireless/mwifiex/main.h   |  1 +
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c |  6 +-
 include/net/cfg80211.h|  1 +
 net/mac80211/cfg.c|  3 ++-
 net/mac80211/ieee80211_i.h|  1 +
 net/mac80211/iface.c  |  3 ++-
 net/mac80211/main.c   |  2 +-
 net/wireless/nl80211.c|  3 ++-
 net/wireless/rdev-ops.h   |  5 +++--
 20 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c 
b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 1c4ce8e..3e0bc77 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1494,6 +1494,7 @@ static int ath6kl_cfg80211_set_power_mgmt(struct wiphy 
*wiphy,
 
 static struct wireless_dev *ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
  const char *name,
+ unsigned char 
name_assign_type,
  enum nl80211_iftype type,
  u32 *flags,
  struct vif_params *params)
@@ -1512,7 +1513,7 @@ static struct wireless_dev 
*ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
return ERR_PTR(-EINVAL);
}
 
-   wdev = ath6kl_interface_add(ar, name, type, if_idx, nw_type);
+   wdev = ath6kl_interface_add(ar, name, name_assign_type, type, if_idx, 
nw_type);
if (!wdev)
return ERR_PTR(-ENOMEM);
 
@@ -3630,13 +3631,14 @@ void ath6kl_cfg80211_vif_cleanup(struct ath6kl_vif *vif)
 }
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type)
 {
struct net_device *ndev;
struct ath6kl_vif *vif;
 
-   ndev = alloc_netdev(sizeof(*vif), name, NET_NAME_UNKNOWN, ether_setup);
+   ndev = alloc_netdev(sizeof(*vif), name, name_assign_type, ether_setup);
if (!ndev)
return NULL;
 
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.h 
b/drivers/net/wireless/ath/ath6kl/cfg80211.h
index b59becd..5aa57a7 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.h
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.h
@@ -25,6 +25,7 @@ enum ath6kl_cfg_suspend_mode {
 };
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type);
 void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
diff --git a/drivers/net/wireless/ath/ath6kl/core.c 
b/drivers/net/wireless/ath/ath6kl/core.c
index b0b6520..997ef42 100644
--- a/drivers/net/wireless/ath/ath6kl/core.c
+++ b/drivers/net/wireless/ath/ath6kl/core.c
@@ -195,8 +195,8 @@ int ath6kl_core_init(struct ath6kl *ar, enum 
ath6kl_htc_type htc_type)
rtnl_lock();
 
/* Add an initial station interface */
-   wdev = ath6kl_interface_add(ar, "wlan%d", NL80211_IFTYPE_STATION, 0,
-   INFRA_NETWORK);
+   wdev = ath6kl_interface_add(ar, "wlan%d", NET_NAME_ENUM,
+   NL80211_IFTYPE_STATION, 0, INFRA_NETWORK);
 
rtnl_unlock();
 
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd.h 
b/drivers/net/wireless/brcm80211/brcmfmac/dhd.h
index a8998eb..4c27a78 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd.h
+++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd.h
@@ -171,7 +171,8 @@ char *brcmf_ifname(struct brcmf_pub *drvr, int idx);
 
 int brcmf_net_attach(struct brcmf_if *i

[PATCH net-next v9 8/9] net: openvswitch - set name assign type

2014-07-17 Thread Tom Gundersen

Signed-off-by: Tom Gundersen 
Cc: Pravin Shelar 
Cc: d...@openvswitch.org
---

v9: set NET_NAME_USER directly in internal_dev_create as requested by Pravin 
Shelar

 net/openvswitch/vport-internal_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/openvswitch/vport-internal_dev.c 
b/net/openvswitch/vport-internal_dev.c
index bd65855..2741285 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -165,7 +165,7 @@ static struct vport *internal_dev_create(const struct 
vport_parms *parms)
netdev_vport = netdev_vport_priv(vport);
 
netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
-parms->name, NET_NAME_UNKNOWN,
+parms->name, NET_NAME_USER,
 do_setup);
if (!netdev_vport->dev) {
err = -ENOMEM;
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 0/9] net: set the correct name_assign_type for more devices

2014-07-17 Thread Tom Gundersen

These patches give name assign types to the vast majority of outstanding 
devices.

Patch 1:give devices with static device names ("lo", etc) the type 
NET_NAME_PREDICTABLE
Patch 2-3:  give devices created over netlink (in particular Wifi-P2P, 
which is currently
a problem) the correct name assign type
Patch 4:do the same for devices created via ioctls
Patch 5-9:  cover a few individual drivers

More individual drivers will be converted in a follow-up series.

Tom Gundersen (9):
  net: set name assign type for names assigned using a static string
  net: nl80211 - make rdev_add_virtual_intf take name_assign_type
  net: nl802154 - make add_iface take name assign type
  net: set name_assign_type when setting names via ioctls
  net: bond - make bond_create take name_assign_type
  net: isdn - set name assign type
  net: vlan - set name assign type
  net: openvswitch - set name assign type
  net: ppp - set name assign type

 drivers/isdn/i4l/isdn_common.c|  4 ++--
 drivers/isdn/i4l/isdn_net.c   | 10 +-
 drivers/isdn/i4l/isdn_net.h   |  4 ++--
 drivers/media/dvb-core/dvb_net.c  |  2 +-
 drivers/misc/sgi-xp/xpnet.c   |  2 +-
 drivers/net/bonding/bond_main.c   |  7 ---
 drivers/net/bonding/bond_sysfs.c  |  2 +-
 drivers/net/bonding/bonding.h |  2 +-
 drivers/net/caif/caif_virtio.c|  2 +-
 drivers/net/eql.c |  4 ++--
 drivers/net/loopback.c|  2 +-
 drivers/net/ppp/ppp_generic.c |  1 +
 drivers/net/tun.c |  9 ++---
 drivers/net/wan/sbni.c|  2 +-
 drivers/net/wan/sdla.c|  4 ++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  6 --
 drivers/net/wireless/ath/ath6kl/cfg80211.h|  1 +
 drivers/net/wireless/ath/ath6kl/core.c|  4 ++--
 drivers/net/wireless/brcm80211/brcmfmac/dhd.h |  3 ++-
 drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c   | 10 ++
 drivers/net/wireless/brcm80211/brcmfmac/fweh.c|  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c |  3 +++
 drivers/net/wireless/brcm80211/brcmfmac/p2p.h |  1 +
 drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c |  4 +++-
 drivers/net/wireless/mwifiex/cfg80211.c   |  5 +++--
 drivers/net/wireless/mwifiex/main.c   |  2 +-
 drivers/net/wireless/mwifiex/main.h   |  1 +
 drivers/s390/net/ctcm_main.c  |  4 ++--
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c |  6 +-
 drivers/staging/vt6655/wpactl.c   |  2 +-
 drivers/tty/n_gsm.c   | 10 +++---
 include/net/cfg80211.h|  1 +
 include/net/wpan-phy.h|  4 +++-
 net/8021q/vlan.c  |  7 ++-
 net/atm/br2684.c  |  5 +++--
 net/bluetooth/bnep/core.c |  2 +-
 net/bridge/br_if.c|  7 ---
 net/bridge/br_ioctl.c |  4 ++--
 net/bridge/br_private.h   |  2 +-
 net/ieee802154/nl-phy.c   |  5 -
 net/ipv6/ip6_gre.c|  2 +-
 net/ipv6/ip6_tunnel.c |  3 ++-
 net/ipv6/ip6_vti.c|  2 +-
 net/ipv6/sit.c|  2 +-
 net/mac80211/cfg.c|  3 ++-
 net/mac80211/ieee80211_i.h|  1 +
 net/mac80211/iface.c  |  3 ++-
 net/mac80211/main.c   |  2 +-
 net/mac802154/ieee802154_dev.c|  7 ---
 net/openvswitch/vport-internal_dev.c  |  2 +-
 net/wireless/nl80211.c|  3 ++-
 net/wireless/rdev-ops.h   |  5 +++--
 52 files changed, 120 insertions(+), 73 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 1/9] net: set name assign type for names assigned using a static string

2014-07-17 Thread Tom Gundersen

This covers the trivial case:

alloc_netdev(_, "bar", NET_NAME_PREDICTABLE, _);

Signed-off-by: Tom Gundersen 
---
 drivers/media/dvb-core/dvb_net.c | 2 +-
 drivers/misc/sgi-xp/xpnet.c  | 2 +-
 drivers/net/caif/caif_virtio.c   | 2 +-
 drivers/net/eql.c| 4 ++--
 drivers/net/loopback.c   | 2 +-
 drivers/net/wan/sbni.c   | 2 +-
 drivers/net/wan/sdla.c   | 4 ++--
 drivers/s390/net/ctcm_main.c | 4 ++--
 drivers/staging/vt6655/wpactl.c  | 2 +-
 net/ipv6/ip6_gre.c   | 2 +-
 net/ipv6/ip6_tunnel.c| 3 ++-
 net/ipv6/ip6_vti.c   | 2 +-
 net/ipv6/sit.c   | 2 +-
 13 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c
index 059e611..998baf6 100644
--- a/drivers/media/dvb-core/dvb_net.c
+++ b/drivers/media/dvb-core/dvb_net.c
@@ -1277,7 +1277,7 @@ static int dvb_net_add_if(struct dvb_net *dvbnet, u16 
pid, u8 feedtype)
return -EINVAL;
 
net = alloc_netdev(sizeof(struct dvb_net_priv), "dvb",
-  NET_NAME_UNKNOWN, dvb_net_setup);
+  NET_NAME_PREDICTABLE, dvb_net_setup);
if (!net)
return -ENOMEM;
 
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
index 557f978..2adb022 100644
--- a/drivers/misc/sgi-xp/xpnet.c
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -544,7 +544,7 @@ xpnet_init(void)
 * use ether_setup() to init the majority of our device
 * structure and then override the necessary pieces.
 */
-   xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_UNKNOWN,
+   xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_PREDICTABLE,
ether_setup);
if (xpnet_device == NULL) {
kfree(xpnet_broadcast_partitions);
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index a5fefb9..07c6a24 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -661,7 +661,7 @@ static int cfv_probe(struct virtio_device *vdev)
int err = -EINVAL;
 
netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
- NET_NAME_UNKNOWN, cfv_netdev_setup);
+ NET_NAME_PREDICTABLE, cfv_netdev_setup);
if (!netdev)
return -ENOMEM;
 
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index 957e5c0..5c1d986 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -585,8 +585,8 @@ static int __init eql_init_module(void)
 
pr_info("%s\n", version);
 
-   dev_eql = alloc_netdev(sizeof(equalizer_t), "eql", NET_NAME_UNKNOWN,
-  eql_setup);
+   dev_eql = alloc_netdev(sizeof(equalizer_t), "eql",
+  NET_NAME_PREDICTABLE, eql_setup);
if (!dev_eql)
return -ENOMEM;
 
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 8f22625..6077691 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -195,7 +195,7 @@ static __net_init int loopback_net_init(struct net *net)
int err;
 
err = -ENOMEM;
-   dev = alloc_netdev(0, "lo", NET_NAME_UNKNOWN, loopback_setup);
+   dev = alloc_netdev(0, "lo", NET_NAME_PREDICTABLE, loopback_setup);
if (!dev)
goto out;
 
diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c
index 758c4ba..ea8ef51 100644
--- a/drivers/net/wan/sbni.c
+++ b/drivers/net/wan/sbni.c
@@ -228,7 +228,7 @@ int __init sbni_probe(int unit)
int err;
 
dev = alloc_netdev(sizeof(struct net_local), "sbni",
-  NET_NAME_UNKNOWN, sbni_devsetup);
+  NET_NAME_PREDICTABLE, sbni_devsetup);
if (!dev)
return -ENOMEM;
 
diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c
index 421ac5f..128d196 100644
--- a/drivers/net/wan/sdla.c
+++ b/drivers/net/wan/sdla.c
@@ -1631,8 +1631,8 @@ static int __init init_sdla(void)
 
printk("%s.\n", version);
 
-   sdla = alloc_netdev(sizeof(struct frad_local), "sdla0",
-   NET_NAME_UNKNOWN, setup_sdla);
+   sdla = alloc_netdev(sizeof(struct frad_local), "sdla0", 
NET_NAME_PREDICTABLE,
+   setup_sdla);
if (!sdla) 
return -ENOMEM;
 
diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c
index e056dd4..5bf3254 100644
--- a/drivers/s390/net/ctcm_main.c
+++ b/drivers/s390/net/ctcm_main.c
@@ -1137,10 +1137,10 @@ static struct net_device *ctcm_init_netdevice(struct 
ctcm_priv *priv)
return NULL;
 
if (IS_MPC(priv))
-   dev = alloc_netdev(0, MPC_DEVICE_GENE, NET_NAME_UNKNOWN,
+   dev = alloc_net

Re: [PATCH v8] net: set name assign type for names assigned using a static string

2014-07-17 Thread Tom Gundersen

On Thu, Jul 17, 2014 at 9:26 AM, Veaceslav Falico  wrote:
> On Thu, Jul 17, 2014 at 09:17:07AM +0200, Tom Gundersen wrote:
>>
>> On Thu, Jul 17, 2014 at 8:56 AM, David Miller  wrote:
>>>
>>> Tom, even if the patches are sort of independent, they logically
>>> belong together.
>>>
>>> So please number them, and provide an appropriate "[PATCH 0/N] ..."
>>> cover letter.
>>>
>>> Please resubmit these patches with that done properly, thank you.
>>
>>
>> Ok. Will do.
>
>
> Also, it would be really nice if you could send patches as
> Doc/net/netdev-FAQ.txt suggests, as lots of us have scripts that parse the
> subject to find out which tree to apply/test the patches.
>
> It's also easier than using git notes edit/git send-email --notes :).
>
> Q: How do I indicate which tree (net vs. net-next) my patch should be in?
>
> A: Firstly, think whether you have a bug fix or new "next-like" content.
>   Then once decided, assuming that you use git, use the prefix flag, i.e.
>
>git format-patch --subject-prefix='PATCH net-next' start..finish
>
>   Use "net" instead of "net-next" (always lower case) in the above for
>   bug-fix net content.  If you don't use git, then note the only magic in
>   the above is just the subject text of the outgoing e-mail, and you can
>   manually change it yourself with whatever MUA you are comfortable with.

Ok. Will do.

Thanks.

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v8] net: set name assign type for names assigned using a static string

2014-07-17 Thread Tom Gundersen

On Thu, Jul 17, 2014 at 8:56 AM, David Miller  wrote:
> Tom, even if the patches are sort of independent, they logically
> belong together.
>
> So please number them, and provide an appropriate "[PATCH 0/N] ..."
> cover letter.
>
> Please resubmit these patches with that done properly, thank you.

Ok. Will do.

Thanks.

Tom
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v8] net: set name assign type for names assigned using a static string

2014-07-17 Thread Tom Gundersen

On Thu, Jul 17, 2014 at 8:56 AM, David Miller da...@davemloft.net wrote:
 Tom, even if the patches are sort of independent, they logically
 belong together.

 So please number them, and provide an appropriate [PATCH 0/N] ...
 cover letter.

 Please resubmit these patches with that done properly, thank you.

Ok. Will do.

Thanks.

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v8] net: set name assign type for names assigned using a static string

2014-07-17 Thread Tom Gundersen

On Thu, Jul 17, 2014 at 9:26 AM, Veaceslav Falico vfal...@redhat.com wrote:
 On Thu, Jul 17, 2014 at 09:17:07AM +0200, Tom Gundersen wrote:

 On Thu, Jul 17, 2014 at 8:56 AM, David Miller da...@davemloft.net wrote:

 Tom, even if the patches are sort of independent, they logically
 belong together.

 So please number them, and provide an appropriate [PATCH 0/N] ...
 cover letter.

 Please resubmit these patches with that done properly, thank you.


 Ok. Will do.


 Also, it would be really nice if you could send patches as
 Doc/net/netdev-FAQ.txt suggests, as lots of us have scripts that parse the
 subject to find out which tree to apply/test the patches.

 It's also easier than using git notes edit/git send-email --notes :).

 Q: How do I indicate which tree (net vs. net-next) my patch should be in?

 A: Firstly, think whether you have a bug fix or new next-like content.
   Then once decided, assuming that you use git, use the prefix flag, i.e.

git format-patch --subject-prefix='PATCH net-next' start..finish

   Use net instead of net-next (always lower case) in the above for
   bug-fix net content.  If you don't use git, then note the only magic in
   the above is just the subject text of the outgoing e-mail, and you can
   manually change it yourself with whatever MUA you are comfortable with.

Ok. Will do.

Thanks.

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 0/9] net: set the correct name_assign_type for more devices

2014-07-17 Thread Tom Gundersen

These patches give name assign types to the vast majority of outstanding 
devices.

Patch 1:give devices with static device names (lo, etc) the type 
NET_NAME_PREDICTABLE
Patch 2-3:  give devices created over netlink (in particular Wifi-P2P, 
which is currently
a problem) the correct name assign type
Patch 4:do the same for devices created via ioctls
Patch 5-9:  cover a few individual drivers

More individual drivers will be converted in a follow-up series.

Tom Gundersen (9):
  net: set name assign type for names assigned using a static string
  net: nl80211 - make rdev_add_virtual_intf take name_assign_type
  net: nl802154 - make add_iface take name assign type
  net: set name_assign_type when setting names via ioctls
  net: bond - make bond_create take name_assign_type
  net: isdn - set name assign type
  net: vlan - set name assign type
  net: openvswitch - set name assign type
  net: ppp - set name assign type

 drivers/isdn/i4l/isdn_common.c|  4 ++--
 drivers/isdn/i4l/isdn_net.c   | 10 +-
 drivers/isdn/i4l/isdn_net.h   |  4 ++--
 drivers/media/dvb-core/dvb_net.c  |  2 +-
 drivers/misc/sgi-xp/xpnet.c   |  2 +-
 drivers/net/bonding/bond_main.c   |  7 ---
 drivers/net/bonding/bond_sysfs.c  |  2 +-
 drivers/net/bonding/bonding.h |  2 +-
 drivers/net/caif/caif_virtio.c|  2 +-
 drivers/net/eql.c |  4 ++--
 drivers/net/loopback.c|  2 +-
 drivers/net/ppp/ppp_generic.c |  1 +
 drivers/net/tun.c |  9 ++---
 drivers/net/wan/sbni.c|  2 +-
 drivers/net/wan/sdla.c|  4 ++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  6 --
 drivers/net/wireless/ath/ath6kl/cfg80211.h|  1 +
 drivers/net/wireless/ath/ath6kl/core.c|  4 ++--
 drivers/net/wireless/brcm80211/brcmfmac/dhd.h |  3 ++-
 drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c   | 10 ++
 drivers/net/wireless/brcm80211/brcmfmac/fweh.c|  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c |  3 +++
 drivers/net/wireless/brcm80211/brcmfmac/p2p.h |  1 +
 drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c |  4 +++-
 drivers/net/wireless/mwifiex/cfg80211.c   |  5 +++--
 drivers/net/wireless/mwifiex/main.c   |  2 +-
 drivers/net/wireless/mwifiex/main.h   |  1 +
 drivers/s390/net/ctcm_main.c  |  4 ++--
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c |  6 +-
 drivers/staging/vt6655/wpactl.c   |  2 +-
 drivers/tty/n_gsm.c   | 10 +++---
 include/net/cfg80211.h|  1 +
 include/net/wpan-phy.h|  4 +++-
 net/8021q/vlan.c  |  7 ++-
 net/atm/br2684.c  |  5 +++--
 net/bluetooth/bnep/core.c |  2 +-
 net/bridge/br_if.c|  7 ---
 net/bridge/br_ioctl.c |  4 ++--
 net/bridge/br_private.h   |  2 +-
 net/ieee802154/nl-phy.c   |  5 -
 net/ipv6/ip6_gre.c|  2 +-
 net/ipv6/ip6_tunnel.c |  3 ++-
 net/ipv6/ip6_vti.c|  2 +-
 net/ipv6/sit.c|  2 +-
 net/mac80211/cfg.c|  3 ++-
 net/mac80211/ieee80211_i.h|  1 +
 net/mac80211/iface.c  |  3 ++-
 net/mac80211/main.c   |  2 +-
 net/mac802154/ieee802154_dev.c|  7 ---
 net/openvswitch/vport-internal_dev.c  |  2 +-
 net/wireless/nl80211.c|  3 ++-
 net/wireless/rdev-ops.h   |  5 +++--
 52 files changed, 120 insertions(+), 73 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net-next v9 1/9] net: set name assign type for names assigned using a static string

2014-07-17 Thread Tom Gundersen

This covers the trivial case:

alloc_netdev(_, bar, NET_NAME_PREDICTABLE, _);

Signed-off-by: Tom Gundersen t...@jklm.no
---
 drivers/media/dvb-core/dvb_net.c | 2 +-
 drivers/misc/sgi-xp/xpnet.c  | 2 +-
 drivers/net/caif/caif_virtio.c   | 2 +-
 drivers/net/eql.c| 4 ++--
 drivers/net/loopback.c   | 2 +-
 drivers/net/wan/sbni.c   | 2 +-
 drivers/net/wan/sdla.c   | 4 ++--
 drivers/s390/net/ctcm_main.c | 4 ++--
 drivers/staging/vt6655/wpactl.c  | 2 +-
 net/ipv6/ip6_gre.c   | 2 +-
 net/ipv6/ip6_tunnel.c| 3 ++-
 net/ipv6/ip6_vti.c   | 2 +-
 net/ipv6/sit.c   | 2 +-
 13 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c
index 059e611..998baf6 100644
--- a/drivers/media/dvb-core/dvb_net.c
+++ b/drivers/media/dvb-core/dvb_net.c
@@ -1277,7 +1277,7 @@ static int dvb_net_add_if(struct dvb_net *dvbnet, u16 
pid, u8 feedtype)
return -EINVAL;
 
net = alloc_netdev(sizeof(struct dvb_net_priv), dvb,
-  NET_NAME_UNKNOWN, dvb_net_setup);
+  NET_NAME_PREDICTABLE, dvb_net_setup);
if (!net)
return -ENOMEM;
 
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
index 557f978..2adb022 100644
--- a/drivers/misc/sgi-xp/xpnet.c
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -544,7 +544,7 @@ xpnet_init(void)
 * use ether_setup() to init the majority of our device
 * structure and then override the necessary pieces.
 */
-   xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_UNKNOWN,
+   xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_PREDICTABLE,
ether_setup);
if (xpnet_device == NULL) {
kfree(xpnet_broadcast_partitions);
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index a5fefb9..07c6a24 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -661,7 +661,7 @@ static int cfv_probe(struct virtio_device *vdev)
int err = -EINVAL;
 
netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
- NET_NAME_UNKNOWN, cfv_netdev_setup);
+ NET_NAME_PREDICTABLE, cfv_netdev_setup);
if (!netdev)
return -ENOMEM;
 
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index 957e5c0..5c1d986 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -585,8 +585,8 @@ static int __init eql_init_module(void)
 
pr_info(%s\n, version);
 
-   dev_eql = alloc_netdev(sizeof(equalizer_t), eql, NET_NAME_UNKNOWN,
-  eql_setup);
+   dev_eql = alloc_netdev(sizeof(equalizer_t), eql,
+  NET_NAME_PREDICTABLE, eql_setup);
if (!dev_eql)
return -ENOMEM;
 
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 8f22625..6077691 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -195,7 +195,7 @@ static __net_init int loopback_net_init(struct net *net)
int err;
 
err = -ENOMEM;
-   dev = alloc_netdev(0, lo, NET_NAME_UNKNOWN, loopback_setup);
+   dev = alloc_netdev(0, lo, NET_NAME_PREDICTABLE, loopback_setup);
if (!dev)
goto out;
 
diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c
index 758c4ba..ea8ef51 100644
--- a/drivers/net/wan/sbni.c
+++ b/drivers/net/wan/sbni.c
@@ -228,7 +228,7 @@ int __init sbni_probe(int unit)
int err;
 
dev = alloc_netdev(sizeof(struct net_local), sbni,
-  NET_NAME_UNKNOWN, sbni_devsetup);
+  NET_NAME_PREDICTABLE, sbni_devsetup);
if (!dev)
return -ENOMEM;
 
diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c
index 421ac5f..128d196 100644
--- a/drivers/net/wan/sdla.c
+++ b/drivers/net/wan/sdla.c
@@ -1631,8 +1631,8 @@ static int __init init_sdla(void)
 
printk(%s.\n, version);
 
-   sdla = alloc_netdev(sizeof(struct frad_local), sdla0,
-   NET_NAME_UNKNOWN, setup_sdla);
+   sdla = alloc_netdev(sizeof(struct frad_local), sdla0, 
NET_NAME_PREDICTABLE,
+   setup_sdla);
if (!sdla) 
return -ENOMEM;
 
diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c
index e056dd4..5bf3254 100644
--- a/drivers/s390/net/ctcm_main.c
+++ b/drivers/s390/net/ctcm_main.c
@@ -1137,10 +1137,10 @@ static struct net_device *ctcm_init_netdevice(struct 
ctcm_priv *priv)
return NULL;
 
if (IS_MPC(priv))
-   dev = alloc_netdev(0, MPC_DEVICE_GENE, NET_NAME_UNKNOWN,
+   dev = alloc_netdev(0, MPC_DEVICE_GENE, NET_NAME_PREDICTABLE,
   ctcm_dev_setup);
else
-   dev

[PATCH net-next v9 2/9] net: nl80211 - make rdev_add_virtual_intf take name_assign_type

2014-07-17 Thread Tom Gundersen

Pass the value down and set it at the same place the name itself is set.

Signed-off-by: Tom Gundersen t...@jklm.no
Cc: Johannes Berg johan...@sipsolutions.net
Cc: John Linville linvi...@tuxdriver.com
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  6 --
 drivers/net/wireless/ath/ath6kl/cfg80211.h|  1 +
 drivers/net/wireless/ath/ath6kl/core.c|  4 ++--
 drivers/net/wireless/brcm80211/brcmfmac/dhd.h |  3 ++-
 drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c   | 10 ++
 drivers/net/wireless/brcm80211/brcmfmac/fweh.c|  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c |  3 +++
 drivers/net/wireless/brcm80211/brcmfmac/p2p.h |  1 +
 drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c |  4 +++-
 drivers/net/wireless/mwifiex/cfg80211.c   |  5 +++--
 drivers/net/wireless/mwifiex/main.c   |  2 +-
 drivers/net/wireless/mwifiex/main.h   |  1 +
 drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c |  6 +-
 include/net/cfg80211.h|  1 +
 net/mac80211/cfg.c|  3 ++-
 net/mac80211/ieee80211_i.h|  1 +
 net/mac80211/iface.c  |  3 ++-
 net/mac80211/main.c   |  2 +-
 net/wireless/nl80211.c|  3 ++-
 net/wireless/rdev-ops.h   |  5 +++--
 20 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c 
b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 1c4ce8e..3e0bc77 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1494,6 +1494,7 @@ static int ath6kl_cfg80211_set_power_mgmt(struct wiphy 
*wiphy,
 
 static struct wireless_dev *ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
  const char *name,
+ unsigned char 
name_assign_type,
  enum nl80211_iftype type,
  u32 *flags,
  struct vif_params *params)
@@ -1512,7 +1513,7 @@ static struct wireless_dev 
*ath6kl_cfg80211_add_iface(struct wiphy *wiphy,
return ERR_PTR(-EINVAL);
}
 
-   wdev = ath6kl_interface_add(ar, name, type, if_idx, nw_type);
+   wdev = ath6kl_interface_add(ar, name, name_assign_type, type, if_idx, 
nw_type);
if (!wdev)
return ERR_PTR(-ENOMEM);
 
@@ -3630,13 +3631,14 @@ void ath6kl_cfg80211_vif_cleanup(struct ath6kl_vif *vif)
 }
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type)
 {
struct net_device *ndev;
struct ath6kl_vif *vif;
 
-   ndev = alloc_netdev(sizeof(*vif), name, NET_NAME_UNKNOWN, ether_setup);
+   ndev = alloc_netdev(sizeof(*vif), name, name_assign_type, ether_setup);
if (!ndev)
return NULL;
 
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.h 
b/drivers/net/wireless/ath/ath6kl/cfg80211.h
index b59becd..5aa57a7 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.h
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.h
@@ -25,6 +25,7 @@ enum ath6kl_cfg_suspend_mode {
 };
 
 struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
+ unsigned char name_assign_type,
  enum nl80211_iftype type,
  u8 fw_vif_idx, u8 nw_type);
 void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
diff --git a/drivers/net/wireless/ath/ath6kl/core.c 
b/drivers/net/wireless/ath/ath6kl/core.c
index b0b6520..997ef42 100644
--- a/drivers/net/wireless/ath/ath6kl/core.c
+++ b/drivers/net/wireless/ath/ath6kl/core.c
@@ -195,8 +195,8 @@ int ath6kl_core_init(struct ath6kl *ar, enum 
ath6kl_htc_type htc_type)
rtnl_lock();
 
/* Add an initial station interface */
-   wdev = ath6kl_interface_add(ar, wlan%d, NL80211_IFTYPE_STATION, 0,
-   INFRA_NETWORK);
+   wdev = ath6kl_interface_add(ar, wlan%d, NET_NAME_ENUM,
+   NL80211_IFTYPE_STATION, 0, INFRA_NETWORK);
 
rtnl_unlock();
 
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd.h 
b/drivers/net/wireless/brcm80211/brcmfmac/dhd.h
index a8998eb..4c27a78 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd.h
+++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd.h
@@ -171,7 +171,8 @@ char *brcmf_ifname(struct brcmf_pub *drvr, int idx);
 
 int

1 2 3 4 >

1 - 100 of 394 matches

Mail list logo