Re: [Xen-devel][RFC 3/3] xen/gntdev: Add support for Linux dma buffers

2018-05-21 Thread Oleksandr Andrushchenko

On 05/22/2018 12:31 AM, Dongwon Kim wrote:

Still need more time to review the whole code changes

Take your time. I just wanted to make sure that all interested parties
are in the discussion, so in the end we have what we all want, not
something covering only my use-cases

  but I noticed one thing.

We've been using the term "hyper_dmabuf" for a hypervisor-agnostic Linux dmabuf
solution, and we are planning to call any of our future solutions for other
hypervisors by the same name. So using the same name for this Xen-specific
structure and the functions you implemented is confusing. Would you change it
to something else, like... "xen_"?

Np, will rename


On Thu, May 17, 2018 at 11:26:04AM +0300, Oleksandr Andrushchenko wrote:

From: Oleksandr Andrushchenko 

Signed-off-by: Oleksandr Andrushchenko 
---
  drivers/xen/gntdev.c  | 954 +-
  include/uapi/xen/gntdev.h | 101 
  include/xen/gntdev_exp.h  |  23 +
  3 files changed, 1066 insertions(+), 12 deletions(-)
  create mode 100644 include/xen/gntdev_exp.h

diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 9510f228efe9..0ee88e193362 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -4,6 +4,8 @@
   * Device for accessing (in user-space) pages that have been granted by other
   * domains.
   *
+ * DMA buffer implementation is based on drivers/gpu/drm/drm_prime.c.
+ *
   * Copyright (c) 2006-2007, D G Murray.
   *   (c) 2009 Gerd Hoffmann 
   *
@@ -37,6 +39,9 @@
  #include 
  #include 
  
+#include 

+#include 
+
  #include 
  #include 
  #include 
@@ -61,16 +66,39 @@ static atomic_t pages_mapped = ATOMIC_INIT(0);
  static int use_ptemod;
  #define populate_freeable_maps use_ptemod
  
+#ifndef GRANT_INVALID_REF

+/*
+ * Note on usage of grant reference 0 as invalid grant reference:
+ * grant reference 0 is valid, but never exposed to a driver,
+ * because of the fact it is already in use/reserved by the PV console.
+ */
+#define GRANT_INVALID_REF  0
+#endif
+
  struct gntdev_priv {
/* maps with visible offsets in the file descriptor */
struct list_head maps;
/* maps that are not visible; will be freed on munmap.
 * Only populated if populate_freeable_maps == 1 */
struct list_head freeable_maps;
+   /* List of dma-bufs. */
+   struct list_head dma_bufs;
/* lock protects maps and freeable_maps */
struct mutex lock;
struct mm_struct *mm;
struct mmu_notifier mn;
+
+   /* Private data of the hyper DMA buffers. */
+
+   struct device *dev;
+   /* List of exported DMA buffers. */
+   struct list_head dmabuf_exp_list;
+   /* List of wait objects. */
+   struct list_head dmabuf_exp_wait_list;
+   /* List of imported DMA buffers. */
+   struct list_head dmabuf_imp_list;
+   /* This is the lock which protects dma_buf_xxx lists. */
+   struct mutex dmabuf_lock;
  };
  
  struct unmap_notify {

@@ -95,10 +123,65 @@ struct grant_map {
struct gnttab_unmap_grant_ref *kunmap_ops;
struct page **pages;
unsigned long pages_vm_start;
+
+   /*
+* All the fields starting with dmabuf_ are only valid if this
+* mapping is used for exporting a DMA buffer.
+* If dmabuf_vaddr is not NULL then this mapping is backed by DMA
+* capable memory.
+*/
+
+   /* Flags used to create this DMA buffer: GNTDEV_DMABUF_FLAG_XXX. */
+   bool dmabuf_flags;
+   /* Virtual/CPU address of the DMA buffer. */
+   void *dmabuf_vaddr;
+   /* Bus address of the DMA buffer. */
+   dma_addr_t dmabuf_bus_addr;
+};
+
+struct hyper_dmabuf {
+   struct gntdev_priv *priv;
+   struct dma_buf *dmabuf;
+   struct list_head next;
+   int fd;
+
+   union {
+   struct {
+   /* Exported buffers are reference counted. */
+   struct kref refcount;
+   struct grant_map *map;
+   } exp;
+   struct {
+   /* Granted references of the imported buffer. */
+   grant_ref_t *refs;
+   /* Scatter-gather table of the imported buffer. */
+   struct sg_table *sgt;
+   /* dma-buf attachment of the imported buffer. */
+   struct dma_buf_attachment *attach;
+   } imp;
+   } u;
+
+   /* Number of pages this buffer has. */
+   int nr_pages;
+   /* Pages of this buffer. */
+   struct page **pages;
+};
+
+struct hyper_dmabuf_wait_obj {
+   struct list_head next;
+   struct hyper_dmabuf *hyper_dmabuf;
+   struct completion completion;
+};
+
+struct hyper_dambuf_attachment {

minor typo: dam->dma (same thing in other places as well.)

sure, thanks



+   struct sg_table *sgt;
+   enum dma_data_direction dir;
  };
  
  static int unmap_grant_pages(struct grant_map *map, int offset, int 

Re: [Xen-devel] [RFC 1/3] xen/balloon: Allow allocating DMA buffers

2018-05-21 Thread Oleksandr Andrushchenko

On 05/21/2018 11:36 PM, Boris Ostrovsky wrote:

On 05/21/2018 03:13 PM, Oleksandr Andrushchenko wrote:

On 05/21/2018 09:53 PM, Boris Ostrovsky wrote:

On 05/21/2018 01:32 PM, Oleksandr Andrushchenko wrote:

On 05/21/2018 07:35 PM, Boris Ostrovsky wrote:

On 05/21/2018 01:40 AM, Oleksandr Andrushchenko wrote:

On 05/19/2018 01:04 AM, Boris Ostrovsky wrote:

On 05/17/2018 04:26 AM, Oleksandr Andrushchenko wrote:

From: Oleksandr Andrushchenko 

A commit message would be useful.

Sure, v1 will have it

Signed-off-by: Oleksandr Andrushchenko


     for (i = 0; i < nr_pages; i++) {
-    page = alloc_page(gfp);
-    if (page == NULL) {
-    nr_pages = i;
-    state = BP_EAGAIN;
-    break;
+    if (ext_pages) {
+    page = ext_pages[i];
+    } else {
+    page = alloc_page(gfp);
+    if (page == NULL) {
+    nr_pages = i;
+    state = BP_EAGAIN;
+    break;
+    }
     }
     scrub_page(page);
     list_add(&page->lru, &pages);
@@ -529,7 +565,7 @@ static enum bp_state
decrease_reservation(unsigned long nr_pages, gfp_t gfp)
     i = 0;
     list_for_each_entry_safe(page, tmp, &pages, lru) {
     /* XENMEM_decrease_reservation requires a GFN */
-    frame_list[i++] = xen_page_to_gfn(page);
+    frames[i++] = xen_page_to_gfn(page);
       #ifdef CONFIG_XEN_HAVE_PVMMU
     /*
@@ -552,18 +588,22 @@ static enum bp_state
decrease_reservation(unsigned long nr_pages, gfp_t gfp)
     #endif
     list_del(&page->lru);
     -    balloon_append(page);
+    if (!ext_pages)
+    balloon_append(page);

So what you are proposing is not really ballooning. You are just
piggybacking on existing interfaces, aren't you?

Sort of. Basically I need to {increase|decrease}_reservation, not
actually
allocating ballooned pages.
Do you think I can simply EXPORT_SYMBOL for
{increase|decrease}_reservation?
Any other suggestion?
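
(For concreteness, a minimal sketch of the EXPORT_SYMBOL approach under
discussion. The wrapper name and return convention are illustrative
assumptions, not code from a posted patch; decrease_reservation(),
balloon_mutex and BP_DONE are the existing identifiers in
drivers/xen/balloon.c.)

/* Sketch: expose the existing static helper via a wrapper instead of
 * exporting it directly, keeping balloon_mutex handling in one place. */
int xen_balloon_decrease_reservation(unsigned long nr_pages, gfp_t gfp)
{
        enum bp_state state;

        mutex_lock(&balloon_mutex);
        state = decrease_reservation(nr_pages, gfp);
        mutex_unlock(&balloon_mutex);

        return state == BP_DONE ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(xen_balloon_decrease_reservation);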

I am actually wondering how much of that code you end up reusing. You
pretty much create new code paths in both routines and common code
ends
up being essentially the hypercall.

Well, I hoped that it would be easier to maintain if I modify existing
code
to support both use-cases, but I am also ok to create new routines if
this
seems to be reasonable - please let me know

    So the question is --- would it make
sense to do all of this separately from the balloon driver?

This can be done, but which driver will host this code then? If we
move from
the balloon driver, then this could go to either gntdev or grant-table.
What's your preference?

A separate module?
Is there any use for this feature outside of your zero-copy DRM driver?

Intel's hyper dma-buf (Dongwon/Matt CC'ed), V4L/GPU at least.

At the time I tried to upstream zcopy driver it was discussed and
decided that
it would be better if I remove all DRM specific code and move it to
Xen drivers.
Thus, this RFC.

But it can also be implemented as a dedicated Xen dma-buf driver which
will have all the
code from this RFC + a bit more (char/misc device handling at least).
This will also require a dedicated user-space library, just like
libxengnttab.so
for gntdev (now I have all new IOCTLs covered there).

If the idea of a dedicated Xen dma-buf driver seems to be more
attractive we
can work toward this solution. BTW, I do support this idea, but was not
sure if Xen community accepts yet another driver which duplicates
quite some code
of the existing gntdev/balloon/grant-table. And now after this RFC I
hope that all cons
and pros of both dedicated driver and gntdev/balloon/grant-table
extension are
clearly seen and we can make a decision.


IIRC the objection to a separate module came up in the context of the
gntdev discussion, because (among other things) people didn't want to have
yet another file in /dev/xen/

Here we are talking about (a new) balloon-like module which doesn't
create any new user-visible interfaces. And as for duplicating code ---
as I said, I am not convinced there is much of duplication.

I might even argue that we should add a new config option for this module.

I am not quite sure I am fully following you here: so, you suggest
that we have balloon.c unchanged, but instead create a new
module (namely a file under the same folder as balloon.c, e.g.
dma-buf-reservation.c) and move those {increase|decrease}_reservation
routines (specific to dma-buf) to that new file? And make it selectable
via Kconfig? If so, then how about the changes to grant-table and gntdev?
Those will look inconsistent then.

If you suggest a new kernel driver module:
IMO, there is nothing bad if we create a dedicated kernel module
(driver) for Xen dma-buf handling selectable under Kconfig option.
Yes, this will create yet another device under /dev/xen,
but most people will never see it if we set Kconfig to default to "n".
And then we'll need user-space support for that, so Xen tools will
be extended with 

Re: [PATCH net-next 6/7] net: dsa: qca8k: Replace GPL boilerplate by SPDX

2018-05-21 Thread Michal Vokáč

On 21.5.2018 17:20, Florian Fainelli wrote:


On 05/21/2018 06:28 AM, Michal Vokáč wrote:

Signed-off-by: Michal Vokáč 


Reviewed-by: Florian Fainelli 

I don't know if we need all people who contributed to that driver to
agree on that, this is not a license change, so it should be okay I presume?


I did that with the same idea in mind, that this is not a license change.
Prior to the change I also went through the log and reviewed a few commits that
did exactly the same thing and did not receive an Ack or anything from the
original authors or further contributors.

Just one example:

4ac0d3f ("eeprom: at24: use SPDX identifier instead of GPL boiler-plate")

Thanks,
Michal


Re: [PATCH net-next 7/7] net: dsa: qca8k: Remove rudundant parentheses

2018-05-21 Thread Michal Vokáč

On 21.5.2018 17:21, Florian Fainelli wrote:


On 05/21/2018 06:28 AM, Michal Vokáč wrote:

Fix warning reported by checkpatch.


Nit in the subject: should be redundant, with that:

Reviewed-by: Florian Fainelli 


Thanks, I will fix it.

Michal



Re: [PATCH v4 00/31] kconfig: move compiler capability tests to Kconfig

2018-05-21 Thread Masahiro Yamada
Hi reviewers,


2018-05-17 23:22 GMT+09:00 Masahiro Yamada :
> 2018-05-17 16:51 GMT+09:00 Nicholas Piggin :
>> On Thu, 17 May 2018 15:16:39 +0900
>> Masahiro Yamada  wrote:
>>
>>> [Introduction]
>>>
>>> The motivation of this work is to move the compiler option tests to
>>> Kconfig from Makefile.  A number of kernel features require the
>>> compiler support.  Enabling such features blindly in Kconfig ends up
>>> with a lot of nasty build-time testing in Makefiles.  If a chosen
>>> feature turns out unsupported by the compiler, what the build system
>>> can do is either to disable it (silently!) or to forcibly break the
>>> build, even though Kconfig has let the user enable it.  By moving the
>>> compiler capability tests to Kconfig, Kconfig entries will be visible
>>> only when they are available.
>>>
>>> [Major Changes in V4]
>>
>> Do you have a git tree for v4? I can test it with the powerpc patches.
>>
>> The new scripting capability in kconfig has allowed us to already
>> improve the config process on powerpc:
>>
>> https://marc.info/?l=linuxppc-embedded&m=152648110727868&w=2
>>
>> I'm sure there's more clever things we can do with it but I haven't
>> had time to think about it yet. One thing that comes to mind is that
>> it might be nice to show the option as disabled, then the user could
>> upgrade their compiler to get the options they want.
>>
>> Anyway v3 worked fine for me, the documentation is really nice, I
>> could implement the above patch without any problem despite being a
>> kbuild dummy. Thanks for the series, ack from me.
>
>
> For easier review and test,
> I pushed v4 to the following branch:
>
>
> git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git
> kconfig-shell-v4
>



Planned change for v5
---------------------

I am thinking of more simplification for v5.
This is my thought.


V4 supported the _lazy_ expansion and 'if' function.


If you have ever looked into GNU Make source code,
you may know some functions lazily expand arguments.

https://git.savannah.gnu.org/cgit/make.git/tree/src/function.c#n2339


"foreach", "if", "or", "and" are specified with "EXP? = 0".

Those 4 functions expand arguments as-needed.

For example,

$(if $(cond),$(yes-case),$(no-case))


The 'if' function receives its arguments before expansion,
then expands $(cond) first.

$(yes-case) is expanded only when $(cond)
turns out to be a non-empty string.
In this case, $(no-case) is not evaluated at all.


$(error msg), if it is evaluated,
immediately terminates Makefile parsing



That's why $(error ...) in Makefiles is generally used
in either of the following forms:

ifeq ...
  $(error blah blah)
endif

  OR

$(if $(cond),$(error blah blah))





This does not work in user-defined functions.

$(call my-func,$(cond),$(error blah blah))


This always terminates parsing regardless of $(cond).

Because the 'call' function is specified with "EXP? = 1",
arguments are expanded (i.e. evaluated)
before they are passed to the 'call' function.


Let's say, you implemented your helper macros based on the
'if' built-in function, like follows:

# return $(2) or $(3), depending on the gcc version.
gcc-ifversion = $(if $(shell test $(gcc-version) -ge $(1) && echo y),$(2),$(3))


In this case, both $(2) and $(3) are evaluated
before the condition part is checked.
So, we end up with unneeded calculation
where we know $(2) and $(3) are exclusively used.


We can exploit the laziness
only when we use the 'if' function in its raw form.
Such use cases are limited.

We use helper macros anyway,
and the advantage of lazy expansion disappears
in user-defined functions.


After consideration, I am reluctant to implement the lazy expansion
(at least until we find it is necessary).

To keep the source code simple,
I do not want to implement what we may or may not need.
At this moment, I am not so sure whether it is useful.


So, here is a planned change in v5:

Kconfig expands arguments before passing them to a function.

This means,
$(error blah blah) would never be useful.


I will remove "error" and "warning" built-in functions.

Instead, I will add "error-on" and "warning-on" built-in functions.

 -  $(error-on,<condition>,<message>)

   If <condition> contains any character,
   it terminates file parsing, showing <message> to stderr.

 -  $(warning-on,<condition>,<message>)

   If <condition> contains any character,
   show <message> to stderr.


I will remove 'if' function too.
If we give up its laziness, we can implement it as
a user-defined function.




-- 
Best Regards
Masahiro Yamada


[PATCH v2] powerpc/32: Implement csum_ipv6_magic in assembly

2018-05-21 Thread Christophe Leroy
The generic csum_ipv6_magic() generates a pretty bad result

<csum_ipv6_magic>:
   0:   81 23 00 00     lwz     r9,0(r3)
   4:   81 03 00 04     lwz     r8,4(r3)
   8:   7c e7 4a 14     add     r7,r7,r9
   c:   7d 29 38 10     subfc   r9,r9,r7
  10:   7d 4a 51 10     subfe   r10,r10,r10
  14:   7d 27 42 14     add     r9,r7,r8
  18:   7d 2a 48 50     subf    r9,r10,r9
  1c:   80 e3 00 08     lwz     r7,8(r3)
  20:   7d 08 48 10     subfc   r8,r8,r9
  24:   7d 4a 51 10     subfe   r10,r10,r10
  28:   7d 29 3a 14     add     r9,r9,r7
  2c:   81 03 00 0c     lwz     r8,12(r3)
  30:   7d 2a 48 50     subf    r9,r10,r9
  34:   7c e7 48 10     subfc   r7,r7,r9
  38:   7d 4a 51 10     subfe   r10,r10,r10
  3c:   7d 29 42 14     add     r9,r9,r8
  40:   7d 2a 48 50     subf    r9,r10,r9
  44:   80 e4 00 00     lwz     r7,0(r4)
  48:   7d 08 48 10     subfc   r8,r8,r9
  4c:   7d 4a 51 10     subfe   r10,r10,r10
  50:   7d 29 3a 14     add     r9,r9,r7
  54:   7d 2a 48 50     subf    r9,r10,r9
  58:   81 04 00 04     lwz     r8,4(r4)
  5c:   7c e7 48 10     subfc   r7,r7,r9
  60:   7d 4a 51 10     subfe   r10,r10,r10
  64:   7d 29 42 14     add     r9,r9,r8
  68:   7d 2a 48 50     subf    r9,r10,r9
  6c:   80 e4 00 08     lwz     r7,8(r4)
  70:   7d 08 48 10     subfc   r8,r8,r9
  74:   7d 4a 51 10     subfe   r10,r10,r10
  78:   7d 29 3a 14     add     r9,r9,r7
  7c:   7d 2a 48 50     subf    r9,r10,r9
  80:   81 04 00 0c     lwz     r8,12(r4)
  84:   7c e7 48 10     subfc   r7,r7,r9
  88:   7d 4a 51 10     subfe   r10,r10,r10
  8c:   7d 29 42 14     add     r9,r9,r8
  90:   7d 2a 48 50     subf    r9,r10,r9
  94:   7d 08 48 10     subfc   r8,r8,r9
  98:   7d 4a 51 10     subfe   r10,r10,r10
  9c:   7d 29 2a 14     add     r9,r9,r5
  a0:   7d 2a 48 50     subf    r9,r10,r9
  a4:   7c a5 48 10     subfc   r5,r5,r9
  a8:   7c 63 19 10     subfe   r3,r3,r3
  ac:   7d 29 32 14     add     r9,r9,r6
  b0:   7d 23 48 50     subf    r9,r3,r9
  b4:   7c c6 48 10     subfc   r6,r6,r9
  b8:   7c 63 19 10     subfe   r3,r3,r3
  bc:   7c 63 48 50     subf    r3,r3,r9
  c0:   54 6a 80 3e     rotlwi  r10,r3,16
  c4:   7c 63 52 14     add     r3,r3,r10
  c8:   7c 63 18 f8     not     r3,r3
  cc:   54 63 84 3e     rlwinm  r3,r3,16,16,31
  d0:   4e 80 00 20     blr

This patch implements it in assembly for PPC32.

Link: https://github.com/linuxppc/linux/issues/9
Signed-off-by: Christophe Leroy 
---
 v2: Fix number of args in final addze

 arch/powerpc/include/asm/checksum.h |  8 
 arch/powerpc/lib/checksum_32.S  | 33 +
 2 files changed, 41 insertions(+)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 54065caa40b3..c41c280c252f 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -13,6 +13,7 @@
 #include 
 #else
 #include 
+#include 
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
@@ -211,6 +212,13 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
return csum_fold(csum_partial(buff, len, 0));
 }
 
+#ifdef CONFIG_PPC32
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+   const struct in6_addr *daddr,
+   __u32 len, __u8 proto, __wsum sum);
+#endif
+
 #endif
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9a671c774b22..9167ab088f04 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -293,3 +293,36 @@ dst_error:
EX_TABLE(51b, dst_error);
 
 EXPORT_SYMBOL(csum_partial_copy_generic)
+
+/*
+ * static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ *   const struct in6_addr *daddr,
+ *   __u32 len, __u8 proto, __wsum sum)
+ */
+
+_GLOBAL(csum_ipv6_magic)
+   lwz r8, 0(r3)
+   lwz r9, 4(r3)
+   lwz r10, 8(r3)
+   lwz r11, 12(r3)
+   addc    r0, r5, r6
+   adde    r0, r0, r7
+   adde    r0, r0, r8
+   adde    r0, r0, r9
+   adde    r0, r0, r10
+   adde    r0, r0, r11
+   lwz r8, 0(r4)
+   lwz r9, 4(r4)
+   lwz r10, 8(r4)
+   lwz r11, 12(r4)
+   adde    r0, r0, r8
+   adde    r0, r0, r9
+   adde    r0, r0, r10
+   adde    r0, r0, r11
+   addze   r0, r0
+   rotlwi  r3, r0, 16
+   add r3, r0, r3
+   not r3, r3
+   rlwinm  r3, r3, 16, 16, 31
+   blr
+EXPORT_SYMBOL(csum_ipv6_magic)
-- 
2.13.3
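
(For orientation: the last four instructions of both the generic output
above, c0..cc, and the new assembly are the standard 32-to-16-bit
checksum fold. A rough C equivalent follows; it is a sketch for
illustration, not code from the patch.)

#include <stdint.h>

/*
 * Fold a 32-bit one's-complement sum to 16 bits, mirroring
 *   rotlwi r10,r3,16 ; add r3,r3,r10 ; not r3 ; rlwinm r3,r3,16,16,31
 * Adding the 16-bit-rotated value folds the two halves (including the
 * end-around carry) into the upper 16 bits, which are then
 * complemented and extracted.
 */
static uint16_t csum_fold32(uint32_t sum)
{
        uint32_t rot = (sum << 16) | (sum >> 16);   /* rotlwi */
        return (uint16_t)(~(sum + rot) >> 16);      /* add; not; rlwinm */
}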





[PATCH 4.16 031/110] i2c: designware: fix poll-after-enable regression

2018-05-21 Thread Greg Kroah-Hartman
4.16-stable review patch.  If anyone has any objections, please let me know.

--

From: Alexander Monakov 

commit 06cb616b1bca7080824acfedb3d4c898e7a64836 upstream.

Not all revisions of DW I2C controller implement the enable status register.
On platforms where that's the case (e.g. BG2CD and SPEAr ARM SoCs), waiting
for enable will time out as reading the unimplemented register yields zero.

It was observed that reading the IC_ENABLE_STATUS register once suffices to
avoid getting it stuck on Bay Trail hardware, so replace polling with one
dummy read of the register.

Fixes: fba4adbbf670 ("i2c: designware: must wait for enable")
Signed-off-by: Alexander Monakov 
Tested-by: Ben Gardner 
Acked-by: Jarkko Nikula 
Signed-off-by: Wolfram Sang 
Cc: sta...@kernel.org
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/i2c/busses/i2c-designware-master.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -209,7 +209,10 @@ static void i2c_dw_xfer_init(struct dw_i
i2c_dw_disable_int(dev);
 
/* Enable the adapter */
-   __i2c_dw_enable_and_wait(dev, true);
+   __i2c_dw_enable(dev, true);
+
+   /* Dummy read to avoid the register getting stuck on Bay Trail */
+   dw_readl(dev, DW_IC_ENABLE_STATUS);
 
/* Clear and enable interrupts */
dw_readl(dev, DW_IC_CLR_INTR);
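
(For context, the polling helper this patch bypasses looks roughly like
the sketch below; it is simplified from memory, not a quote of the
tree, and the timeout constants are assumptions. On controllers that do
not implement IC_ENABLE_STATUS the read always returns 0, so the loop
can only time out when enabling.)

static void __i2c_dw_enable_and_wait(struct dw_i2c_dev *dev, bool enable)
{
        int timeout = 100;

        do {
                __i2c_dw_enable(dev, enable);
                if ((dw_readl(dev, DW_IC_ENABLE_STATUS) & 1) == enable)
                        return;
                usleep_range(25, 250);
        } while (timeout--);

        dev_warn(dev->dev, "timeout in %sabling adapter\n",
                 enable ? "en" : "dis");
}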






[PATCH 4.14 21/95] i2c: designware: fix poll-after-enable regression

2018-05-21 Thread Greg Kroah-Hartman
4.14-stable review patch.  If anyone has any objections, please let me know.

--

From: Alexander Monakov 

commit 06cb616b1bca7080824acfedb3d4c898e7a64836 upstream.

Not all revisions of DW I2C controller implement the enable status register.
On platforms where that's the case (e.g. BG2CD and SPEAr ARM SoCs), waiting
for enable will time out as reading the unimplemented register yields zero.

It was observed that reading the IC_ENABLE_STATUS register once suffices to
avoid getting it stuck on Bay Trail hardware, so replace polling with one
dummy read of the register.

Fixes: fba4adbbf670 ("i2c: designware: must wait for enable")
Signed-off-by: Alexander Monakov 
Tested-by: Ben Gardner 
Acked-by: Jarkko Nikula 
Signed-off-by: Wolfram Sang 
Cc: sta...@kernel.org
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/i2c/busses/i2c-designware-master.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -207,7 +207,10 @@ static void i2c_dw_xfer_init(struct dw_i
i2c_dw_disable_int(dev);
 
/* Enable the adapter */
-   __i2c_dw_enable_and_wait(dev, true);
+   __i2c_dw_enable(dev, true);
+
+   /* Dummy read to avoid the register getting stuck on Bay Trail */
+   dw_readl(dev, DW_IC_ENABLE_STATUS);
 
/* Clear and enable interrupts */
dw_readl(dev, DW_IC_CLR_INTR);






[PATCH 4.9 17/87] i2c: designware: fix poll-after-enable regression

2018-05-21 Thread Greg Kroah-Hartman
4.9-stable review patch.  If anyone has any objections, please let me know.

--

From: Alexander Monakov 

commit 06cb616b1bca7080824acfedb3d4c898e7a64836 upstream.

Not all revisions of DW I2C controller implement the enable status register.
On platforms where that's the case (e.g. BG2CD and SPEAr ARM SoCs), waiting
for enable will time out as reading the unimplemented register yields zero.

It was observed that reading the IC_ENABLE_STATUS register once suffices to
avoid getting it stuck on Bay Trail hardware, so replace polling with one
dummy read of the register.

Fixes: fba4adbbf670 ("i2c: designware: must wait for enable")
Signed-off-by: Alexander Monakov 
Tested-by: Ben Gardner 
Acked-by: Jarkko Nikula 
Signed-off-by: Wolfram Sang 
Cc: sta...@kernel.org
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/i2c/busses/i2c-designware-core.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

--- a/drivers/i2c/busses/i2c-designware-core.c
+++ b/drivers/i2c/busses/i2c-designware-core.c
@@ -507,7 +507,10 @@ static void i2c_dw_xfer_init(struct dw_i
i2c_dw_disable_int(dev);
 
/* Enable the adapter */
-   __i2c_dw_enable_and_wait(dev, true);
+   __i2c_dw_enable(dev, true);
+
+   /* Dummy read to avoid the register getting stuck on Bay Trail */
+   dw_readl(dev, DW_IC_ENABLE_STATUS);
 
/* Clear and enable interrupts */
dw_readl(dev, DW_IC_CLR_INTR);






Re: [PATCH net-next 4/7] net: dsa: qca8k: Force CPU port to its highest bandwidth

2018-05-21 Thread Michal Vokáč

On 21.5.2018 17:19, Florian Fainelli wrote:


On 05/21/2018 06:28 AM, Michal Vokáč wrote:

By default autonegotiation is enabled to configure MAC on all ports.
For the CPU port autonegotiation can not be used so we need to set
some sensible defaults manually.

This patch forces the default setting of the CPU port to 1000Mbps/full
duplex which is the chip maximum capability.

Also correct size of the bit field used to configure link speed.

Signed-off-by: Michal Vokáč 


Reviewed-by: Florian Fainelli 

Likewise, wouldn't we want to have a:

Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family")

tag here as well?


Good point, I will add this in v2.

Thanks,
Michal



Re: [PATCH net-next 3/7] net: dsa: qca8k: Enable RXMAC when bringing up a port

2018-05-21 Thread Michal Vokáč

On 21.5.2018 17:17, Florian Fainelli wrote:


On 05/21/2018 06:28 AM, Michal Vokáč wrote:

When a port is brought up/down do not enable/disable only the TXMAC
but the RXMAC as well. This is essential for the CPU port to work.

Signed-off-by: Michal Vokáč 


Reviewed-by: Florian Fainelli 

Should this have:

Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family")?


Yes, I think it is a good idea to add this. Will do it in v2.

Thanks,
Michal



Re: [PATCH v3 4/6] ALSA: xen-front: Implement handling of shared buffers

2018-05-21 Thread Oleksandr Andrushchenko

On 05/21/2018 11:26 PM, Takashi Iwai wrote:

On Thu, 17 May 2018 08:26:16 +0200,
Takashi Iwai wrote:

On Tue, 15 May 2018 08:02:08 +0200,
Oleksandr Andrushchenko wrote:

On 05/15/2018 09:01 AM, Takashi Iwai wrote:

On Tue, 15 May 2018 07:46:38 +0200,
Oleksandr Andrushchenko wrote:

On 05/14/2018 11:28 PM, Takashi Iwai wrote:

On Mon, 14 May 2018 08:27:40 +0200,
Oleksandr Andrushchenko wrote:

--- /dev/null
+++ b/sound/xen/xen_snd_front_shbuf.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+
+/*
+ * Xen para-virtual sound device
+ *
+ * Copyright (C) 2016-2018 EPAM Systems Inc.
+ *
+ * Author: Oleksandr Andrushchenko 
+ */
+
+#include 
+#include 
+
+#include "xen_snd_front_shbuf.h"

Hm, with the local build test, I get the following error:

 CC [M]  sound/xen/xen_snd_front_shbuf.o
 In file included from sound/xen/xen_snd_front_shbuf.c:11:0:
 ./include/xen/xen.h:18:8: error: unknown type name ‘bool’
  extern bool xen_pvh;
  ^~~~
  In file included from ./include/xen/interface/xen.h:30:0,
   from ./include/xen/xen.h:29,
   from sound/xen/xen_snd_front_shbuf.c:11:
 ./arch/x86/include/asm/xen/interface.h:92:21: error: unknown type name 
‘uint64_t’
  DEFINE_GUEST_HANDLE(uint64_t);
  ^

Adding #include  fixed the issue.

Did you really test your patches with the latest Linus tree?

My bad, it does build for ARM (which is my target), but also does
need "#include " for x86 which I didn't build this time.
Sorry about that.

Do you want me to resend this single patch or you can make the change
while applying?

Yes, it's fine.

Thank you

FWIW, the patches are in topic/xen branch in sound.git tree, and I'll
keep boiling for a while to see if any issues are caught by 0day bot.

... and now the topic/xen branch got merged to for-next, targeted for
4.18.

Thanks for your patient work!


Takashi

Great news, thank you,
Oleksandr



[lkp-robot] [tty] b6da31b2c0: WARNING:possible_circular_locking_dependency_detected

2018-05-21 Thread kernel test robot

FYI, we noticed the following commit (built with gcc-6):

commit: b6da31b2c07c46f2dcad1d86caa835227a16d9ff ("tty: Fix data race in tty_insert_flip_string_fixed_flag")
https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master

in testcase: boot

on test machine: qemu-system-x86_64 -enable-kvm -cpu Nehalem -smp 2 -m 512M

caused below changes (please refer to attached dmesg/kmsg for entire 
log/backtrace):


+---------------------------------------------------------------------+------------+------------+
|                                                                     | 638a6f4ebe | b6da31b2c0 |
+---------------------------------------------------------------------+------------+------------+
| boot_successes                                                      | 0          | 0          |
| boot_failures                                                       | 14         | 17         |
| WARNING:at_lib/debugobjects.c:#__debug_object_init                  | 14         | 17         |
| RIP:__debug_object_init                                             | 14         | 17         |
| WARNING:suspicious_RCU_usage                                        | 14         | 17         |
| lib/test_rhashtable.c:#suspicious_rcu_dereference_protected()usage  | 14         | 17         |
| WARNING:possible_circular_locking_dependency_detected               | 0          | 15         |
+---------------------------------------------------------------------+------------+------------+



[   20.553647] WARNING: possible circular locking dependency detected
[   20.554329] 4.17.0-rc3-00051-gb6da31b #1 Tainted: GW
[   20.554980] --
[   20.555637] S36udev-cache/240 is trying to acquire lock:
[   20.556191] (ptrval) (_hash[i].lock){-.-.}, at: 
debug_object_activate+0x96/0x250
[   20.557071] 
[   20.557071] but task is already holding lock:
[   20.557702] (ptrval) (&(>lock)->rlock){-...}, at: 
pty_write+0x3a/0x90
[   20.558508] 
[   20.558508] which lock already depends on the new lock.
[   20.558508] 
[   20.559355] 
[   20.559355] the existing dependency chain (in reverse order) is:
[   20.560095] 
[   20.560095] -> #3 (&(>lock)->rlock){-...}:
[   20.560746]lock_acquire+0x1b3/0x20a
[   20.561249]_raw_spin_lock_irqsave+0x5b/0x70
[   20.561741]tty_port_tty_get+0x1b/0x50
[   20.562177]tty_port_default_wakeup+0xb/0x30
[   20.562689]serial8250_tx_chars+0x11e/0x1d0
[   20.563190]serial8250_handle_irq+0xa7/0xd0
[   20.563726]serial8250_default_handle_irq+0x39/0x50
[   20.564318]serial8250_interrupt+0x45/0x110
[   20.564826]__handle_irq_event_percpu+0x137/0x340
[   20.565388]handle_irq_event_percpu+0x30/0x70
[   20.565910]handle_irq_event+0x34/0x60
[   20.566374]handle_edge_irq+0x197/0x1c0
[   20.566840]handle_irq+0xfd/0x110
[   20.567290]do_IRQ+0x7b/0x100
[   20.567671]ret_from_intr+0x0/0x2f
[   20.568095]_raw_spin_unlock_irqrestore+0x60/0x70
[   20.568668]uart_write+0x143/0x1a0
[   20.569087]n_tty_write+0x250/0x490
[   20.569517]tty_write+0x1bf/0x2a0
[   20.569909]__vfs_write+0x33/0x140
[   20.570411]vfs_write+0xdf/0x1b0
[   20.570795]ksys_write+0x55/0xc0
[   20.571185]do_syscall_64+0x71/0x220
[   20.571656]entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   20.572248] 
[   20.572248] -> #2 (_lock_key){-...}:
[   20.572812]lock_acquire+0x1b3/0x20a
[   20.573261]_raw_spin_lock_irqsave+0x5b/0x70
[   20.573771]serial8250_console_write+0xfb/0x2a0
[   20.574319]console_unlock+0x434/0x6e0
[   20.574772]vprintk_emit+0x53d/0x560
[   20.575218]printk+0x52/0x6e
[   20.575614]register_console+0x30b/0x3c0
[   20.576073]univ8250_console_init+0x24/0x27
[   20.576568]console_init+0x20e/0x368
[   20.577009]start_kernel+0x44d/0x597
[   20.577454]secondary_startup_64+0xa5/0xb0
[   20.577940] 
[   20.577940] -> #1 (console_owner){-...}:
[   20.578517]lock_acquire+0x1b3/0x20a
[   20.578958]console_unlock+0x2e6/0x6e0
[   20.579439]vprintk_emit+0x53d/0x560
[   20.579858]printk+0x52/0x6e
[   20.580246]__debug_object_init+0x51d/0x580
[   20.580730]rhashtable_init+0x1fd/0x250
[   20.581180]rhltable_init+0x10/0x20
[   20.581609]test_insert_dup+0x58/0x718
[   20.582050]test_rht_init+0x2d9/0x5e1
[   20.582491]do_one_initcall+0x131/0x334
[   20.582961]kernel_init_freeable+0x324/0x3f4
[   20.583510]kernel_init+0xa/0x100
[   20.583942]ret_from_fork+0x3a/0x50
[   20.584400] 
[   20.584400] -> #0 (_hash[i].lock){-.-.}:
[   20.585056]__lock_acquire+0x5af/0x7b0
[   20.585568]lock_acquire+0x1b3/0x20a
[   20.586034]_raw_spin_lock_irqsave+0x5b/0x70
[   20.586575]

Re: [PATCH net-next 1/7] net: dsa: qca8k: Add QCA8334 binding documentation

2018-05-21 Thread Michal Vokáč

On 21.5.2018 16:47, Andrew Lunn wrote:

On Mon, May 21, 2018 at 03:28:07PM +0200, Michal Vokáč wrote:

Signed-off-by: Michal Vokáč 


Hi Michal

It would be good to document that fixed-link can be used.


OK, I will document that and add a commit message as well as you
mentioned before.

Thanks,
Michal


set_fs(KERNEL_DS) vs iovec

2018-05-21 Thread Benjamin Herrenschmidt
Hi guys !

I was helping with a small driver when I stumbled upon a comment from a
reviewer pointing to an old LWN article talking about deprecating set_fs
due to security concerns:

https://lwn.net/Articles/722267/

Now, this is a very simple driver running on a small/slow ARM SoC,
which reads from a FIFO and writes into a destination buffer.

It provides 2 interfaces, a userspace one (read syscall) and an in-
kernel one since some other kernel drivers use it as a transport.

My existing implementation uses the good old construct of doing
put_user() in the low level FIFO pumping code, and has the in-kernel
API do:

 old_fs = get_fs();
 set_fs(KERNEL_DS);
 rc = __sbefifo_submit(sbefifo, command, cmd_len,
   response, resp_len);
 set_fs(old_fs);

Which is simple, turns into fairly efficient code on that simple
device, and doesn't seem to have security issues...

That said, following the advice in that article, I tried to look at the
iovec stuff and noticed:

 - The APIs are almost entirely undocumented (or did I look in the
wrong place?)

 - The code in lib/iov_iter.c is rather ... unreadable

 - It's also significantly more complex, thus would probably result in
a slower driver (remember: small SoC). It's quite overkill for my
simple use case.

 - There are very few users; set_fs(KERNEL_DS) is still the most common
method used by drivers.

Hence my question: Is it still acceptable these days to use
set_fs(KERNEL_DS) for simple cases like this? Or is it really
deprecated, meaning all new users should use iovecs?

Cheers,
Ben.
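
[For context, a minimal, hedged sketch of what the iov_iter-based
alternative to the construct above could look like. It assumes a
__sbefifo_submit_iter() variant of the low-level routine, which is
hypothetical (the thread's driver has no such function), and note that
iov_iter_kvec()'s direction/flags argument has changed across kernel
versions:]

    #include <linux/uio.h>

    /*
     * Hedged sketch, not the actual driver code: give the FIFO pump a
     * single iov_iter-based entry point so the in-kernel caller no
     * longer needs set_fs(KERNEL_DS).  __sbefifo_submit_iter() is
     * hypothetical.
     */
    static int sbefifo_submit_kernel(struct sbefifo *sbefifo,
                                     const void *cmd, size_t cmd_len,
                                     void *resp, size_t resp_len)
    {
            struct kvec vec = { .iov_base = resp, .iov_len = resp_len };
            struct iov_iter iter;

            /* Build a kernel-space iterator over the response buffer. */
            iov_iter_kvec(&iter, READ, &vec, 1, resp_len);

            /* The pump then uses copy_to_iter() instead of put_user(). */
            return __sbefifo_submit_iter(sbefifo, cmd, cmd_len, &iter);
    }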




Re: [RFC PATCH] mtd: spi-nor: add support to non-uniform SPI NOR flash memories

2018-05-21 Thread Tudor Ambarus

Hi, Marek,

On 05/21/2018 07:59 PM, Marek Vasut wrote:

On 05/21/2018 06:42 PM, Tudor Ambarus wrote:

Hi, Marek,


[...]


This is a transitional patch: non-uniform erase maps will be used later
when initialized based on the SFDP data.


What about non-SFDP non-linear flashes ?


Support for non-SFDP non-uniform flashes is not addressed with this
proposal; I should have said this in the commit message, thanks. But we
are backward compatible: if non-SFDP, the flashes are considered
uniform.


OK. btw wall-of-text description of patch isn't my fav thing.


Signed-off-by: Cyrille Pitchen 

[tudor.amba...@microchip.com:
- add improvements on how the erase map is handled. The map is an array
describing the boundaries of the erase regions. LSB bits of the region's
offset are used to describe the supported erase types, to indicate if
that specific region is the last region in the map and to mark if the
region is overlaid or not. When one sends an addr and len to erase a
chunk of memory, we identify in which region the address fits, we start
erasing with the best fitted erase commands and when the region ends,
continue to erase from the next region. The erase is optimal: identify
the start offset (once), then erase with the best erase command,
move forward and repeat.


Is that like an R-tree ?


Not really. I find this RFC proposal faster and neat, but I'm open for
suggestions and guidance.

One wants to erase a contiguous chunk of memory and sends us the
starting address and the total length. The algorithm of finding the best
sequence of erase commands can be summarized in four steps:

1. Find in which region the address fits.
This step is done only once, at the beginning. For the non-uniform
SFDP-defined flashes, usually there are two or three regions defined.
Nevertheless, in the worst case, the maximum number of regions that can
be defined fits in eight bits, so 255. Linear search for just 255 elements
in the worst case looks good to me, especially since we do this search
once.

2. Find the *best* erase command that is defined in that region.
Each region can define a maximum of 4 erase commands. *Best* is defined as
the largest supported erase command with which the provided
address is aligned and which does not erase more than what the user has
asked for. In case of overlaid regions, alignment does not matter. The
largest command will erase the remainder of the overlaid region without
touching the region with which it overlaps (see S25FS512S). The
supported erase commands are ordered by size with the biggest queried
first. It is desirable to erase with large erase commands so that we
erase as much as we can in one shot, minimizing the erase() calls.

3. Erase sector with the *best* erase command and move forward in a
linear fashion.
addr += cmd->size;
len -= cmd->size;
If the new address exceeds the end of this region, move to the next.

4. While (len) goto step2.

That's all. Linearity is an advantage. We find the starting region and
then we traverse each region in order without other queries.
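
[To make the four-step walk concrete, a hedged sketch; the struct names
and helpers below are invented for illustration and are not from the
patch, which works on struct spi_nor and its erase map:]

    #include <linux/types.h>
    #include <linux/errno.h>

    struct erase_cmd    { u32 size; u8 opcode; };
    struct erase_region { u64 offset; u64 size; };

    /* Hypothetical helpers: the step-1 lookup and the "best command" choice. */
    static const struct erase_region *find_region(u64 addr);
    static const struct erase_cmd *best_cmd(const struct erase_region *r,
                                            u64 addr, u64 len);
    static int erase_one(const struct erase_cmd *cmd, u64 addr);

    static int erase_walk(u64 addr, u64 len)
    {
            /* Step 1: locate the region containing addr (done once). */
            const struct erase_region *region = find_region(addr);
            int ret;

            while (len) {                           /* step 4 */
                    /* Step 2: biggest aligned command that fits in len. */
                    const struct erase_cmd *cmd = best_cmd(region, addr, len);

                    if (!cmd)
                            return -EINVAL;
                    /* Step 3: erase and advance linearly. */
                    ret = erase_one(cmd, addr);
                    if (ret)
                            return ret;
                    addr += cmd->size;
                    len  -= cmd->size;
                    if (addr >= region->offset + region->size)
                            region++;   /* regions are contiguous in the map */
            }
            return 0;
    }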




- order erase types by size, with the biggest erase type at BIT(0). With
this, we can iterate from the biggest supported erase type to the smallest,
and when we find one that meets all the required conditions, break the loop.
This saves time in determining the best erase cmd.

- minimize the amount of erase() calls by using the best sequence of erase
type commands depending on alignment.


Nice, this was long overdue


- replace spi_nor_find_uniform_erase() with spi_nor_select_uniform_erase().
Even for the SPI NOR memories with non-uniform erase types, we can determine
at init if there are erase types that can erase the entire memory. Fill at
init the uniform_erase_type bitmask, to encode the erase type commands that
can erase the entire memory.

- clarify support for overlaid regions. Considering one of the erase maps
of the S25FS512S memory:
Bottom: 8x 4KB sectors at bottom (only 4KB erase supported),
  1x overlaid 224KB sector at bottom (only 256KB erase supported),
  255x 256KB sectors (only 256KB erase supported)
S25FS512S states that 'if a sector erase command is applied to a 256KB range
that is overlaid by 4KB sectors, the overlaid 4KB sectors are not affected by
the erase'. When at init, the overlaid region size should be set to
region->size = erase_size - count; in order to not miss chunks of data
when traversing the regions.

- backward compatibility test done on MX25L25673G.

The 'erase with the best command, move forward and repeat' approach was
suggested by Cristian Birsan in a brainstorm session, so:
]
Suggested-by: Cristian Birsan 
Signed-off-by: Tudor Ambarus 
---
   drivers/mtd/spi-nor/spi-nor.c | 281 +++---
   include/linux/mtd/spi-nor.h   |  89 +
   2 files changed, 356 insertions(+), 14 deletions(-)

diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index 494b7a2..bb70664 100644

Re: [PATCH v4 07/31] kconfig: add built-in function support

2018-05-21 Thread Ulf Magnusson
On Tue, May 22, 2018 at 6:50 AM, Ulf Magnusson  wrote:
> On Tue, May 22, 2018 at 5:11 AM, Masahiro Yamada
>  wrote:
>> 2018-05-22 0:10 GMT+09:00 Ulf Magnusson :
>>> On Mon, May 21, 2018 at 4:32 PM, Ulf Magnusson  wrote:
 On Mon, May 21, 2018 at 4:23 PM, Ulf Magnusson  wrote:
> On Sun, May 20, 2018 at 4:50 PM, Sam Ravnborg  wrote:
>> Will the following be equal:
>>
>> $(foo,abc,$(x),$(y))
>> $(foo, abc, $(x), $(y))
>>
>> make is rather annoying as space is significant, but there seems no good
>> reason for kconfig to inherit this.
>> So unless there are good arguments consider allowing the spaces.
>> If the current implementation already supports optional spaces then I
>> just missed it while reviewing.
>>
>> Sam
>
> +1 from me.
>
> I also find the rules for whitespace in Make confusing, and always
> have to look them up when doing trickier stuff. Maybe they're the
> result of people not considering whitespace initially, and stuff
> getting tacked on later. GNU Make adds some alternate syntaxes with
> quotes.
>
> I was going to mention shell, but it looks like you already did. :)
>
> If we go with Make-like syntax, maybe we could at least have a variant
> with fewer whitespace gotchas.
>
> Cheers,
> Ulf

 Maybe it'd be a pain to implement, but something like $(foo $(x) "two
 words" "interpolated $(stuff)") seems pretty nice, with three
 arguments there.
>>>
>>> Guess that might interact poorly with $(shell foo "bar baz") though.
>>> Kinda nice to have a syntax that doesn't overlap with shell when
>>> building shell commands.
>>
>>
>> Right.  I can easily imagine
>> that would end up with more gotchas due to quoting and escaping.
>
> Yeah, you're right. It's probably trying to fix something that isn't
> broken. Make's syntax really isn't bad there, just slightly
> non-obvious at first...
>
> Think it's fine now. Better to commit to the syntax than trying to be
> "helpful" by adding a bunch of random exceptions too. That probably
> gives a bigger mess in the end...
>
> Cheers,
> Ulf

I'm fine with the comma-after-function-name change though. That just
makes it more consistent.

Cheers,
Ulf


Re: Tasks RCU vs Preempt RCU

2018-05-21 Thread Joel Fernandes
(Resending because my previous mail client terribly wrapped things..)

On Mon, May 21, 2018 at 09:59:51PM -0400, Steven Rostedt wrote:
[...] 
> > 
> > Just thinking out loud and probably some food for thought..
> > 
> > The rcu_read_lock/unlock primitives are extremely fast, so I don't
> > personally think there's a time hit.
> > 
> > Could we get around the trampoline code == data issue by say using a
> > multi-stage trampoline like so? :
> > 
> > call func_tramp --> (static trampoline)     (dynamic trampoline)
> >                     rcu_read_lock()  --->   set up stack
> >                                             call function_tracer()
> >                                             pop stack
> >                     rcu_read_unlock() <--   ret
> >  
> > I know there's probably more to it than this, but conceptually at least, it
> 
> Yes, there is more to it. Think about why we create a dynamic
> trampoline. It is to make a custom call per callback for a group of
> functions being traced by that callback.
> 
> Now, if we make that static trampoline, we just lost the reason for the
> dynamic one. How would that work if you have 5 different users of the
> callbacks (and lets not forget about optimized kprobes)? How do you
> jump from the static trampoline to the dynamic one with a single call?
> 
> > feels like all the RCU infrastructure is already there to handle preemption
> > within a trampoline and it would be cool if the trampoline were as shown
> > above for the dynamically allocated trampolines. At least I feel it will be
> > faster than the pre-trampoline code that did the hash lookups / matching to
> > call the right function callbacks, and could help eliminate the need for the
> > RCU-tasks subsystem and its kthread then.
> 
> I don't see how the static trampoline would be able to call. Do we
> create a static trampoline for every function that is traced and never
> delete it? That's a lot of memory.

Yeah, ok. I agree that was a dumb idea. :) I see it defeats the point.

> Also, we trace rcu_read_lock/unlock(), and I use that for a lot of
> debugging. And we also need to deal with tracing code that RCU does not
> watch, because function tracing does a lot of that too. I finally gave
> up trying to have the stack tracer trace those locations, because it
> was a serious game of whack a mole that would never end. I don't want
> to give up full function tracing for the same reason.

Yes, I understand. It's good to not have it depend on too many things which
may limit its utility.

> > If you still feel it's not worth it, then that's okay too and clearly the
> > RCU-tasks has benefits such as a simpler trampoline implementation..
> 
> If you are worried about making RCU simpler, we can go to my original
> thought which was to make a home grown RCU like system that we can use,
> as this has different requirements than normal RCU has. Like we don't
> need a "lock" at all. We just need guaranteed quiescent points that we
> make sure all tasks would go through before freeing the trampolines.
> But it was decided to create a new flavor of RCU instead of doing that.

Yes, let's brainstorm this if you like. One way I was thinking: we can
manually check every CPU and see what state it's in (usermode, kernel, idle,
etc.) using an IPI mechanism. Once all CPUs have been seen to be in usermode
or idle at least once, then we are done. You have probably already thought
about this, so feel free to say why it's not a good idea, but to me there are
3 places where a task's quiescent state is recorded: during the timer tick,
during task sleep, and during rcu_note_voluntary_context_switch in
cond_resched_rcu_qs. Of these, I feel only the cond_resched_rcu_qs case isn't
trackable with the IPI mechanism, which may make the detection a bit slower,
but tasks-RCU in mainline is slow right now anyway (~1 second delay if any
task was held).

thanks,

 - Joel
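
[A hedged sketch of the IPI polling idea above, purely illustrative and
not a proposed kernel API: seen_qs[], qs_probe() and
wait_for_trampoline_qs() are made up, and as the mail notes it would
still miss the cond_resched_rcu_qs case:]

    #include <linux/smp.h>
    #include <linux/sched.h>
    #include <linux/ptrace.h>
    #include <asm/irq_regs.h>

    static bool seen_qs[NR_CPUS];       /* illustrative only */

    static void qs_probe(void *info)
    {
            struct pt_regs *regs = get_irq_regs();

            /* User mode or the idle loop counts as quiescent for trampolines. */
            if ((regs && user_mode(regs)) || is_idle_task(current))
                    seen_qs[smp_processor_id()] = true;
    }

    static void wait_for_trampoline_qs(void)
    {
            bool done;
            int cpu;

            memset(seen_qs, 0, sizeof(seen_qs));
            do {
                    done = true;
                    for_each_online_cpu(cpu) {
                            if (seen_qs[cpu])
                                    continue;
                            /* IPI the CPU and sample the state it was in. */
                            smp_call_function_single(cpu, qs_probe, NULL, 1);
                            if (!seen_qs[cpu])
                                    done = false;
                    }
                    if (!done)
                            schedule_timeout_uninterruptible(HZ / 10);
            } while (!done);
    }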



Re: [PATCH v4 07/31] kconfig: add built-in function support

2018-05-21 Thread Ulf Magnusson
On Tue, May 22, 2018 at 5:11 AM, Masahiro Yamada
 wrote:
> 2018-05-22 0:10 GMT+09:00 Ulf Magnusson :
>> On Mon, May 21, 2018 at 4:32 PM, Ulf Magnusson  wrote:
>>> On Mon, May 21, 2018 at 4:23 PM, Ulf Magnusson  wrote:
 On Sun, May 20, 2018 at 4:50 PM, Sam Ravnborg  wrote:
> Will the following be equal:
>
> $(foo,abc,$(x),$(y))
> $(foo, abc, $(x), $(y))
>
> make is rather annoying as space is significant, but there seems no good
> reason for kconfig to inherit this.
> So unless there are good arguments consider allowing the spaces.
> If the current implementation already supports optional spaces then I just
> missed it while reviewing.
>
> Sam

 +1 from me.

 I also find the rules for whitespace in Make confusing, and always
 have to look them up when doing trickier stuff. Maybe they're the
 result of people not considering whitespace initially, and stuff
 getting tacked on later. GNU Make adds some alternate syntaxes with
 quotes.

 I was going to mention shell, but it looks like you already did. :)

 If we go with Make-like syntax, maybe we could at least have a variant
 with fewer whitespace gotchas.

 Cheers,
 Ulf
>>>
>>> Maybe it'd be a pain to implement, but something like $(foo $(x) "two
>>> words" "interpolated $(stuff)") seems pretty nice, with three
>>> arguments there.
>>
>> Guess that might interact poorly with $(shell foo "bar baz") though.
>> Kinda nice to have a syntax that doesn't overlap with shell when
>> building shell commands.
>
>
> Right.  I can easily imagine
> that would end up with more gotchas due to quoting and escaping.

Yeah, you're right. It's probably trying to fix something that isn't
broken. Make's syntax really isn't bad there, just slightly
non-obvious at first...

Think it's fine now. Better to commit to the syntax than trying to be
"helpful" by adding a bunch of random exceptions too. That probably
gives a bigger mess in the end...

Cheers,
Ulf


Re: [PATCH v3 4/4] rcu: Unlock non-start node only after accessing its gp_seq_needed

2018-05-21 Thread Joel Fernandes
On Mon, May 21, 2018 at 09:16:51PM -0700, Paul E. McKenney wrote:
> On Mon, May 21, 2018 at 05:28:23PM -0700, Paul E. McKenney wrote:
> > On Mon, May 21, 2018 at 05:07:34PM -0700, Joel Fernandes wrote:
> > > On Mon, May 21, 2018 at 04:25:38PM -0700, Paul E. McKenney wrote:
> > > > On Sun, May 20, 2018 at 09:42:20PM -0700, Joel Fernandes wrote:
> > > > > We acquire gp_seq_needed locklessly. To be safe, let's do the unlocking
> > > > > after the access.
> > > > 
> > > > Actually, no, we hold rnp_start's ->lock throughout.  And this CPU (or in
> > > > the case of no-CBs CPUs, this task) is in charge of rdp->gp_seq_needed,
> > > > so nothing else is accessing it.  Or at least that is the intent.  ;-)
> > > 
> > > I was talking about protecting the internal node's rnp->gp_seq_needed, not
> > > the rnp_start's gp_seq_needed.
> > 
> > Ah, good point, I missed the "if" condition.  This can be argued to work,
> > sort of, given that we still hold the leaf rcu_node structure's lock,
> > so that there is a limit to how far grace periods can advance.
> > 
> > But the code would of course be much cleaner with your change.
> > 
> > > We are protecting them in the loop:
> > > 
> > > like this:
> > > for(...)
> > >   if (rnp != rnp_start)
> > >   raw_spin_lock_rcu_node(rnp);
> > >   [...]
> > >   // access rnp->gp_seq and rnp->gp_seq_needed
> > >   [...]
> > >   if (rnp != rnp_start)
> > >   raw_spin_unlock_rcu_node(rnp);
> > > 
> > > But we don't need to do such protection in unlock_out? I'm sorry if I'm
> > > missing something, but I'm wondering: if rnp->gp_seq_needed of an internal
> > > node can be accessed locklessly, then why can't that be done also in the
> > > funnel locking loop - after all, we are holding the rnp_start's lock
> > > throughout, right?
> > 
> > I was focused on the updates, and missed the rnp->gp_seq_req access in the
> > "if" statement.  The current code does sort of work, but only assuming
> > that the compiler doesn't tear the load, and so your change would help.
> > Could you please resend with your other two updated patches?  It depends
> > on one of the earlier patches, so does not apply cleanly as-is.  I could
> > hand-apply it, but that sounds like a good way to make your updated
> > series fail to apply.  ;-)
> > 
> > But could you also make the commit log explicitly call out the "if"
> > condition as being the offending access?
> 
> Never mind, me being stupid.  I need to apply this change to the original
> commit "rcu: Make rcu_nocb_wait_gp() check if GP already requested", which
> I have done with this attribution:
> 
> [ paulmck: Move lock release past "if" as suggested by Joel Fernandes. ]
> 
> I have rebased my stack on top of the updated commit.

Cool, makes sense. I am assuming this means I don't have to resend this
patch; if I do, let me know :)

Either way, once you push your updated tree to kernel.org, I'll double check
to make sure the change is in :)

thanks, good night,

 - Joel
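
[A hedged, simplified illustration of the ordering being discussed, not
the exact tree code; the "done" flag is invented, and only the position
of the unlock relative to the read of rnp->gp_seq_needed matters here:]

    bool done = false;

    /* Before: the "if" reads rnp->gp_seq_needed after the unlock. */
    if (rnp != rnp_start)
            raw_spin_unlock_rcu_node(rnp);
    if (ULONG_CMP_GE(rnp->gp_seq, rnp->gp_seq_needed))  /* lockless read */
            goto unlock_out;

    /* After: do the read while the node is still locked. */
    if (ULONG_CMP_GE(rnp->gp_seq, rnp->gp_seq_needed))  /* read under lock */
            done = true;
    if (rnp != rnp_start)
            raw_spin_unlock_rcu_node(rnp);
    if (done)
            goto unlock_out;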



Re: Tasks RCU vs Preempt RCU

2018-05-21 Thread Joel Fernandes
On Mon, May 21, 2018 at 6:59 PM Steven Rostedt  wrote:
[...]
> >
> > Just thinking out loud and probably some food for thought..
> >
> > The rcu_read_lock/unlock primitives are extremely fast, so I don't
> > personally think there's a time hit.
> >
> > Could we get around the trampoline code == data issue by say using a
> > multi-stage trampoline like so? :
> >
> >   call func_tramp --> (static trampoline)     (dynamic trampoline)
> >                       rcu_read_lock()  --->   set up stack
> >                                               call function_tracer()
> >                                               pop stack
> >                       rcu_read_unlock() <--   ret
> >
> > I know there's probably more to it than this, but conceptually at least, it

> Yes, there is more to it. Think about why we create a dynamic
> trampoline. It is to make a custom call per callback for a group of
> functions being traced by that callback.

> Now, if we make that static trampoline, we just lost the reason for the
> dynamic one. How would that work if you have 5 different users of the
> callbacks (and lets not forget about optimized kprobes)? How do you
> jump from the static trampoline to the dynamic one with a single call?

> > feels like all the RCU infrastructure is already there to handle
> > preemption within a trampoline and it would be cool if the trampoline
> > were as shown above for the dynamically allocated trampolines. At least
> > I feel it will be faster than the pre-trampoline code that did the hash
> > lookups / matching to call the right function callbacks, and could help
> > eliminate the need for the RCU-tasks subsystem and its kthread then.

> I don't see how the static trampoline would be able to call. Do we
> create a static trampoline for every function that is traced and never
> delete it? That's a lot of memory.

Yeah, ok. That was a dumb idea. :) I see it defeats the point.

> Also, we trace rcu_read_lock/unlock(), and I use that for a lot of
> debugging. And we also need to deal with tracing code that RCU does not
> watch, because function tracing does a lot of that too. I finally gave
> up trying to have the stack tracer trace those locations, because it
> was a serious game of whack a mole that would never end. I don't want
> to give up full function tracing for the same reason.

Yes, I understand. It's good to not have it depend on too many things which
may limit its utility.

> > If you still feel it's not worth it, then that's okay too and clearly
> > the RCU-tasks has benefits such as a simpler trampoline implementation..

> If you are worried about making RCU simpler, we can go to my original
> thought which was to make a home grown RCU like system that we can use,
> as this has different requirements than normal RCU has. Like we don't
> need a "lock" at all. We just need guaranteed quiescent points that we
> make sure all tasks would go through before freeing the trampolines.
> But it was decided to create a new flavor of RCU instead of doing that.

Yes, let's brainstorm this if you like. One way I was thinking: we can
manually check every CPU and see what state it's in (usermode, kernel, idle,
etc.) using an IPI mechanism. Once all CPUs have been seen to be in usermode
or idle at least once, then we are done. You have probably already thought
about this, so feel free to say why it's not a good idea, but to me there
are 3 places where a task's quiescent state is recorded: during the timer
tick, during task sleep and during cond_resched_tasks_qs. Of these, I feel
only the cond_resched case isn't trackable with an IPI mechanism.

thanks,

- Joel


Re: [PATCH 1/2] powerpc: Detect the presence of big-core with interleaved threads

2018-05-21 Thread Gautham R Shenoy
Hello Michael,

On Fri, May 18, 2018 at 11:21:22PM +1000, Michael Ellerman wrote:
> "Gautham R. Shenoy"  writes:
> 
> > diff --git a/arch/powerpc/kernel/setup-common.c 
> > b/arch/powerpc/kernel/setup-common.c
> > index 0af5c11..884dff2 100644
> > --- a/arch/powerpc/kernel/setup-common.c
> > +++ b/arch/powerpc/kernel/setup-common.c
> > @@ -436,8 +438,56 @@ static void __init cpu_init_thread_core_maps(int tpc)
> > printk(KERN_DEBUG " (thread shift is %d)\n", threads_shift);
> >  }
> >  
> > -
> >  u32 *cpu_to_phys_id = NULL;
> > +/*
> > + * check_for_interleaved_big_core - Checks if the core represented by
> > + *  dn is a big-core whose threads are interleavings of the
> > + *  threads of the component small cores.
> > + *
> > + * @dn: device node corresponding to the core.
> > + *
> > + * Returns true if the core is a interleaved big-core.
> > + * Returns false otherwise.
> > + */
> > +static inline bool check_for_interleaved_big_core(struct device_node *dn)
> > +{
> > +   int len, nr_groups, threads_per_group;
> > +   const __be32 *thread_groups;
> > +   __be32 *thread_list, *first_cpu_idx;
> > +   int cur_cpu, next_cpu, i, j;
> > +
> > +   thread_groups = of_get_property(dn, "ibm,thread-groups", &len);
> > +   if (!thread_groups)
> > +   return false;
> 
> There are better device tree APIs than bare of_get_property() these
> days, can you try to use those?

Ok, I will use them.

> 
> > +   nr_groups = be32_to_cpu(*(thread_groups + 1));
> > +   if (nr_groups <= 1)
> > +   return false;
> 
> eg. this would be of_property_read_u32_index()
> 

Ok.

> > @@ -565,7 +615,16 @@ void __init smp_setup_cpu_maps(void)
> > vdso_data->processorCount = num_present_cpus();
> >  #endif /* CONFIG_PPC64 */
> >  
> > -/* Initialize CPU <=> thread mapping/
> > +   dn = of_find_node_by_type(NULL, "cpu");
> > +   if (dn) {
> > +   if (check_for_interleaved_big_core(dn)) {
> > +   has_interleaved_big_core = true;
> > +   pr_info("Detected interleaved big-cores\n");
> > +   }
> > +   of_node_put(dn);
> > +   }
> 
> This is a bit untidy, given how unlikely it is that you would have no
> CPUs :)

This can actually go into the earlier loop where we initialize the
smp_processor_ids(). I have fixed it in the next iteration.

> 
> You should be able to do the lookup of the property and the setting of
> has_interleaved_big_core all inside
> check_for_interleaved_big_core().

Yes, that's what I am doing in the next iteration.

> 
> cheers
> 
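
[A hedged sketch of the reviewer's suggestion, illustrative only and
not the actual respin: dn_has_thread_groups() is a made-up helper that
reads the second cell of "ibm,thread-groups" directly instead of
parsing raw of_get_property() data:]

    #include <linux/of.h>

    static bool dn_has_thread_groups(struct device_node *dn)
    {
            u32 nr_groups;

            /* Index 1 is the cell that held nr_groups in the code above. */
            if (of_property_read_u32_index(dn, "ibm,thread-groups", 1,
                                           &nr_groups))
                    return false;   /* property absent or too short */

            return nr_groups > 1;
    }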



Re: [PATCH v5 3/8] remoteproc: Move proxy unvote to handover irq handler

2018-05-21 Thread Bjorn Andersson
On Mon 21 May 10:27 PDT 2018, Sibi Sankar wrote:

> Introduce interrupt handler for smp2p ready interrupt to
> handle start completion. Move the proxy votes for clocks
> and regulators to the handover interrupt context.
> 
> Signed-off-by: Sibi Sankar 

I added the "synchronization logic" of patch 4 to this one to make it
functional, and after testing on db820c I merged patches 3 through 8 into
rproc-next.

I skipped the remainder of patch 4, as I think the ready_irq should be
included and I wonder if irq management can remove the need for the
"running" variable.

I also moved the enable_irq before we actually start the core, since
it's been shown downstream that the smp2p irq might fire fast enough for
us to miss it (and the smp2p driver does not redeliver interrupts).

Regards,
Bjorn

> ---
>  drivers/remoteproc/qcom_q6v5_pil.c | 29 +++--
>  1 file changed, 23 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/remoteproc/qcom_q6v5_pil.c 
> b/drivers/remoteproc/qcom_q6v5_pil.c
> index 76a0c00aa04a..6333bdcd9448 100644
> --- a/drivers/remoteproc/qcom_q6v5_pil.c
> +++ b/drivers/remoteproc/qcom_q6v5_pil.c
> @@ -809,11 +809,6 @@ static int q6v5_start(struct rproc *rproc)
>   "Failed to reclaim mba buffer system may become 
> unstable\n");
>   qproc->running = true;
>  
> - q6v5_clk_disable(qproc->dev, qproc->proxy_clks,
> -  qproc->proxy_clk_count);
> - q6v5_regulator_disable(qproc, qproc->proxy_regs,
> -qproc->proxy_reg_count);
> -
>   return 0;
>  
>  reclaim_mpss:
> @@ -892,6 +887,12 @@ static int q6v5_stop(struct rproc *rproc)
>   WARN_ON(ret);
>  
>   reset_control_assert(qproc->mss_restart);
> +
> + q6v5_clk_disable(qproc->dev, qproc->proxy_clks,
> +  qproc->proxy_clk_count);
> + q6v5_regulator_disable(qproc, qproc->proxy_regs,
> +qproc->proxy_reg_count);
> +
>   q6v5_clk_disable(qproc->dev, qproc->active_clks,
>qproc->active_clk_count);
>   q6v5_regulator_disable(qproc, qproc->active_regs,
> @@ -959,7 +960,7 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void 
> *dev)
>   return IRQ_HANDLED;
>  }
>  
> -static irqreturn_t q6v5_handover_interrupt(int irq, void *dev)
> +static irqreturn_t q6v5_ready_interrupt(int irq, void *dev)
>  {
>   struct q6v5 *qproc = dev;
>  
> @@ -967,6 +968,18 @@ static irqreturn_t q6v5_handover_interrupt(int irq, void 
> *dev)
>   return IRQ_HANDLED;
>  }
>  
> +static irqreturn_t q6v5_handover_interrupt(int irq, void *dev)
> +{
> + struct q6v5 *qproc = dev;
> +
> + q6v5_clk_disable(qproc->dev, qproc->proxy_clks,
> +  qproc->proxy_clk_count);
> + q6v5_regulator_disable(qproc, qproc->proxy_regs,
> +qproc->proxy_reg_count);
> +
> + return IRQ_HANDLED;
> +}
> +
>  static irqreturn_t q6v5_stop_ack_interrupt(int irq, void *dev)
>  {
>   struct q6v5 *qproc = dev;
> @@ -1194,6 +1207,10 @@ static int q6v5_probe(struct platform_device *pdev)
>   if (ret < 0)
>   goto free_rproc;
>  
> + ret = q6v5_request_irq(qproc, pdev, "ready", q6v5_ready_interrupt);
> + if (ret < 0)
> + goto free_rproc;
> +
>   ret = q6v5_request_irq(qproc, pdev, "handover", 
> q6v5_handover_interrupt);
>   if (ret < 0)
>   goto free_rproc;
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 
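
[A hedged sketch of the enable_irq ordering Bjorn describes above,
illustrative and not the merged diff: q6v5_start_sketch() and a stored
qproc->ready_irq interrupt number are assumptions for this example:]

    #include <linux/interrupt.h>

    static int q6v5_start_sketch(struct q6v5 *qproc)
    {
            int ret;

            enable_irq(qproc->ready_irq);   /* arm the handler first */

            ret = q6v5proc_reset(qproc);    /* then release the core,
                                             * so a fast smp2p edge is
                                             * not lost */
            if (ret)
                    disable_irq(qproc->ready_irq);
            return ret;
    }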


Re: [PATCH 1/2] powerpc: Detect the presence of big-core with interleaved threads

2018-05-21 Thread Gautham R Shenoy
Hello Michael,

On Fri, May 18, 2018 at 11:21:22PM +1000, Michael Ellerman wrote:
> "Gautham R. Shenoy"  writes:
> 
> > diff --git a/arch/powerpc/kernel/setup-common.c 
> > b/arch/powerpc/kernel/setup-common.c
> > index 0af5c11..884dff2 100644
> > --- a/arch/powerpc/kernel/setup-common.c
> > +++ b/arch/powerpc/kernel/setup-common.c
> > @@ -436,8 +438,56 @@ static void __init cpu_init_thread_core_maps(int tpc)
> > printk(KERN_DEBUG " (thread shift is %d)\n", threads_shift);
> >  }
> >  
> > -
> >  u32 *cpu_to_phys_id = NULL;
> > +/*
> > + * check_for_interleaved_big_core - Checks if the core represented by
> > + *  dn is a big-core whose threads are interleavings of the
> > + *  threads of the component small cores.
> > + *
> > + * @dn: device node corresponding to the core.
> > + *
> > + * Returns true if the core is a interleaved big-core.
> > + * Returns false otherwise.
> > + */
> > +static inline bool check_for_interleaved_big_core(struct device_node *dn)
> > +{
> > +   int len, nr_groups, threads_per_group;
> > +   const __be32 *thread_groups;
> > +   __be32 *thread_list, *first_cpu_idx;
> > +   int cur_cpu, next_cpu, i, j;
> > +
> > +   thread_groups = of_get_property(dn, "ibm,thread-groups", );
> > +   if (!thread_groups)
> > +   return false;
> 
> There are better device tree APIs than bare of_get_property() these
> days, can you try to use those?

Ok, I will use them.

> 
> > +   nr_groups = be32_to_cpu(*(thread_groups + 1));
> > +   if (nr_groups <= 1)
> > +   return false;
> 
> eg. this would be of_property_read_u32_index()
> 

Ok.

> > @@ -565,7 +615,16 @@ void __init smp_setup_cpu_maps(void)
> > vdso_data->processorCount = num_present_cpus();
> >  #endif /* CONFIG_PPC64 */
> >  
> > -/* Initialize CPU <=> thread mapping/
> > +   dn = of_find_node_by_type(NULL, "cpu");
> > +   if (dn) {
> > +   if (check_for_interleaved_big_core(dn)) {
> > +   has_interleaved_big_core = true;
> > +   pr_info("Detected interleaved big-cores\n");
> > +   }
> > +   of_node_put(dn);
> > +   }
> 
> This is a bit untidy, given how unlikely it is that you would have no
> CPUs :)

This can actually go into the earlier loop where we initialize the
smp_processor_ids(). I have fixed it in the next iteration.

> 
> You should be able to do the lookup of the property and the setting of
> has_interleaved_big_core all inside
> check_for_interleaved_big_core().

Yes, that's what I am doing in the next iteration.

> 
> cheers
> 



Re: [PATCH v5 3/8] remoteproc: Move proxy unvote to handover irq handler

2018-05-21 Thread Bjorn Andersson
On Mon 21 May 10:27 PDT 2018, Sibi Sankar wrote:

> Introduce interrupt handler for smp2p ready interrupt to
> handle start completion. Move the proxy votes for clocks
> and regulators to the handover interrupt context.
> 
> Signed-off-by: Sibi Sankar 

I added the "synchronization logic" of patch 4 to this one to make it
functional, and after testing on db820c I merged patches 3 through 8
into rproc-next.

I skipped the remainder of patch 4, as I think the ready_irq should be
included and I wonder if irq management can remove the need for the
"running" variable.

I also moved the enable_irq before we actually start the core, since
it's been shown downstream that the smp2p irq might fire fast enough for
us to miss it (and the smp2p driver does not redeliver interrupts).
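
A minimal sketch of that ordering (helper and field names here are
illustrative, not the exact rproc-next code):

	/* Arm the ready IRQ before releasing the core from reset; if the
	 * core signals readiness first, the smp2p edge is lost for good.
	 */
	enable_irq(qproc->ready_irq);

	ret = q6v5_start_core(qproc);
	if (ret)
		disable_irq(qproc->ready_irq);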

Regards,
Bjorn

> ---
>  drivers/remoteproc/qcom_q6v5_pil.c | 29 +++--
>  1 file changed, 23 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/remoteproc/qcom_q6v5_pil.c 
> b/drivers/remoteproc/qcom_q6v5_pil.c
> index 76a0c00aa04a..6333bdcd9448 100644
> --- a/drivers/remoteproc/qcom_q6v5_pil.c
> +++ b/drivers/remoteproc/qcom_q6v5_pil.c
> @@ -809,11 +809,6 @@ static int q6v5_start(struct rproc *rproc)
>   "Failed to reclaim mba buffer system may become 
> unstable\n");
>   qproc->running = true;
>  
> - q6v5_clk_disable(qproc->dev, qproc->proxy_clks,
> -  qproc->proxy_clk_count);
> - q6v5_regulator_disable(qproc, qproc->proxy_regs,
> -qproc->proxy_reg_count);
> -
>   return 0;
>  
>  reclaim_mpss:
> @@ -892,6 +887,12 @@ static int q6v5_stop(struct rproc *rproc)
>   WARN_ON(ret);
>  
>   reset_control_assert(qproc->mss_restart);
> +
> + q6v5_clk_disable(qproc->dev, qproc->proxy_clks,
> +  qproc->proxy_clk_count);
> + q6v5_regulator_disable(qproc, qproc->proxy_regs,
> +qproc->proxy_reg_count);
> +
>   q6v5_clk_disable(qproc->dev, qproc->active_clks,
>qproc->active_clk_count);
>   q6v5_regulator_disable(qproc, qproc->active_regs,
> @@ -959,7 +960,7 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void 
> *dev)
>   return IRQ_HANDLED;
>  }
>  
> -static irqreturn_t q6v5_handover_interrupt(int irq, void *dev)
> +static irqreturn_t q6v5_ready_interrupt(int irq, void *dev)
>  {
>   struct q6v5 *qproc = dev;
>  
> @@ -967,6 +968,18 @@ static irqreturn_t q6v5_handover_interrupt(int irq, void 
> *dev)
>   return IRQ_HANDLED;
>  }
>  
> +static irqreturn_t q6v5_handover_interrupt(int irq, void *dev)
> +{
> + struct q6v5 *qproc = dev;
> +
> + q6v5_clk_disable(qproc->dev, qproc->proxy_clks,
> +  qproc->proxy_clk_count);
> + q6v5_regulator_disable(qproc, qproc->proxy_regs,
> +qproc->proxy_reg_count);
> +
> + return IRQ_HANDLED;
> +}
> +
>  static irqreturn_t q6v5_stop_ack_interrupt(int irq, void *dev)
>  {
>   struct q6v5 *qproc = dev;
> @@ -1194,6 +1207,10 @@ static int q6v5_probe(struct platform_device *pdev)
>   if (ret < 0)
>   goto free_rproc;
>  
> + ret = q6v5_request_irq(qproc, pdev, "ready", q6v5_ready_interrupt);
> + if (ret < 0)
> + goto free_rproc;
> +
>   ret = q6v5_request_irq(qproc, pdev, "handover", 
> q6v5_handover_interrupt);
>   if (ret < 0)
>   goto free_rproc;
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 


Re: [PATCH 1/2] powerpc: Detect the presence of big-core with interleaved threads

2018-05-21 Thread Gautham R Shenoy
Hello Michael,

On Fri, May 18, 2018 at 11:14:04PM +1000, Michael Ellerman wrote:
> Gautham R Shenoy  writes:
> ...
> >> > @@ -565,7 +615,16 @@ void __init smp_setup_cpu_maps(void)
> >> >  vdso_data->processorCount = num_present_cpus();
> >> >  #endif /* CONFIG_PPC64 */
> >> >  
> >> > -/* Initialize CPU <=> thread mapping */
> >> > +dn = of_find_node_by_type(NULL, "cpu");
> >> > +if (dn) {
> >> > +if (check_for_interleaved_big_core(dn)) {
> >> > +has_interleaved_big_core = true;
> >> > +pr_info("Detected interleaved big-cores\n");
> >> 
> >> Is there a runtime way to check this also?  If the dmesg buffer overflows,
> >> we lose this.
> >
> > Where do you suggest we put this ? Should it be a part of
> > /proc/cpuinfo ?
> 
> Hmm, it'd be nice not to pollute it with more junk.
> 
> Can you just look at the pir files in sysfs?

Sure Michael. I will explore this option.

If we add a file called l1cache_thread_group, then the siblings of the
big-core that share the L1-cache can be described as follows.

# cd  /sys/devices/system/cpu
# grep . cpu[0-7]/l1cache_thread_group
cpu0/l1cache_thread_group:0,2,4,6
cpu1/l1cache_thread_group:1,3,5,7
cpu2/l1cache_thread_group:0,2,4,6
cpu3/l1cache_thread_group:1,3,5,7
cpu4/l1cache_thread_group:0,2,4,6
cpu5/l1cache_thread_group:1,3,5,7
cpu6/l1cache_thread_group:0,2,4,6
cpu7/l1cache_thread_group:1,3,5,7

> 
> eg. on a normal system:
> 
>   # cd /sys/devices/system/cpu
>   # grep . cpu[0-7]/pir
>   cpu0/pir:20
>   cpu1/pir:21
>   cpu2/pir:22
>   cpu3/pir:23
>   cpu4/pir:24
>   cpu5/pir:25
>   cpu6/pir:26
>   cpu7/pir:27


> 
> 
> cheers
> 



[PATCH] fs/affs: fix potential memory leak in option parsing

2018-05-21 Thread Chengguang Xu
When the 'prefix' option is specified multiple times,
the current option parsing leaks the previously allocated string.
Hence, call kfree() on the previous one in this case.
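
A mount invocation that hits this path twice (device and paths are
hypothetical, for illustration only):

	mount -t affs -o prefix=/vol,prefix=/vol2 /dev/sdb1 /mnt/affs

Without the kfree(), the string duplicated for the first "prefix=" is
overwritten by the second and never freed.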

Signed-off-by: Chengguang Xu 
---
 fs/affs/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/affs/super.c b/fs/affs/super.c
index e602619..d1ad11a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -241,6 +241,7 @@ enum {
affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
break;
case Opt_prefix:
+   kfree(*prefix);
*prefix = match_strdup(&args[0]);
if (!*prefix)
return 0;
-- 
1.8.3.1



[PATCH RFC 3/3] scsi: ufs: Add sysfs support for ufs provision

2018-05-21 Thread Sayali Lokhande
Add sysfs support to trigger ufs provisioning at runtime.
Usage:
echo  > /sys/bus/platform/devices/1d84000.ufshcd/ufs_provision
To check provisioning status:
cat /sys/bus/platform/devices/1d84000.ufshc/ufs_provision
1 -> Success (Reboot device to check updated provisioning)
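
The value written is a single line of space-separated integers matching
the parser below: nine device descriptor fields, ten fields per LU (the
LU size in KB as the fourth), then lun_to_grow, commit and num_luns. A
hypothetical single-LU example (every field value is illustrative only):

echo "1 1 1 1 0 0 0 0 0 0 1 0 1048576 0 0 0 12 0 0 -1 1 1" > \
	/sys/bus/platform/devices/1d84000.ufshc/ufs_provision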

Signed-off-by: Sayali Lokhande 
---
 drivers/scsi/ufs/ufs.h|   2 +
 drivers/scsi/ufs/ufshcd.c | 125 +-
 drivers/scsi/ufs/ufshcd.h |   6 +++
 3 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h
index 1f99904..0b497fc 100644
--- a/drivers/scsi/ufs/ufs.h
+++ b/drivers/scsi/ufs/ufs.h
@@ -427,6 +427,7 @@ enum {
 };
 
 struct ufs_unit_desc {
+   u8 LUNum;
u8 bLUEnable;  /* 1 for enabled LU */
u8 bBootLunID; /* 0 for using this LU for boot */
u8 bLUWriteProtect;/* 1 = power on WP, 2 = permanent WP */
@@ -451,6 +452,7 @@ struct ufs_config_descr {
u32qVendorConfigCode;  /* Vendor specific configuration code */
struct ufs_unit_desc unit[8];
u8  lun_to_grow;
+   u8 num_luns;
 };
 
 /* Task management service response */
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 9ae64e2..0c94885 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -1580,6 +1580,106 @@ void ufshcd_release(struct ufs_hba *hba)
 }
 EXPORT_SYMBOL_GPL(ufshcd_release);
 
+static ssize_t ufshcd_desc_config_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct ufs_hba *hba = dev_get_drvdata(dev);
+
+   return snprintf(buf, PAGE_SIZE, "%d\n", hba->ufs_provision.is_enabled);
+}
+
+static ssize_t ufshcd_desc_config_store(struct device *dev,
+   struct device_attribute *attr, const char *buf, size_t count)
+{
+   struct ufs_hba *hba = dev_get_drvdata(dev);
+   struct ufs_config_descr *cfg = &hba->cfgs;
+   char *strbuf;
+   char *strbuf_copy;
+   int desc_buf[count];
+   int *pt;
+   char *token;
+   int i, ret;
+   int value, commit = 0;
+   int num_luns = 0;
+   int KB_per_block = 4;
+
+   /* reserve one byte for null termination */
+   strbuf = kmalloc(count + 1, GFP_KERNEL);
+   if (!strbuf)
+   return -ENOMEM;
+
+   strbuf_copy = strbuf;
+   strlcpy(strbuf, buf, count + 1);
+
+   for (i = 0; i < count; i++) {
+   token = strsep(&strbuf, " ");
+   if (!token) {
+   num_luns = desc_buf[i-1];
+   dev_dbg(hba->dev, "%s: token %s, num_luns %d\n",
+   __func__, token, num_luns);
+   break;
+   }
+
+   ret = kstrtoint(token, 0, &value);
+   if (ret) {
+   dev_err(hba->dev, "%s: kstrtoint failed %d %s\n",
+   __func__, ret, token);
+   break;
+   }
+   desc_buf[i] = value;
+   dev_dbg(hba->dev, " desc_buf[%d] 0x%x", i, desc_buf[i]);
+   }
+
+   /* Fill in the descriptors with parsed configuration data */
+   pt = desc_buf;
+   cfg->bNumberLU = *pt++;
+   cfg->bBootEnable = *pt++;
+   cfg->bDescrAccessEn = *pt++;
+   cfg->bInitPowerMode = *pt++;
+   cfg->bHighPriorityLUN = *pt++;
+   cfg->bSecureRemovalType = *pt++;
+   cfg->bInitActiveICCLevel = *pt++;
+   cfg->wPeriodicRTCUpdate = *pt++;
+   cfg->bConfigDescrLock = *pt++;
+   dev_dbg(hba->dev, "%s: %u %u %u %u %u %u %u %u %u\n", __func__,
+   cfg->bNumberLU, cfg->bBootEnable, cfg->bDescrAccessEn,
+   cfg->bInitPowerMode, cfg->bHighPriorityLUN, cfg->bSecureRemovalType,
+   cfg->bInitActiveICCLevel, cfg->wPeriodicRTCUpdate,
+   cfg->bConfigDescrLock);
+
+   for (i = 0; i < num_luns; i++) {
+   cfg->unit[i].LUNum = *pt++;
+   cfg->unit[i].bLUEnable = *pt++;
+   cfg->unit[i].bBootLunID = *pt++;
+   /* dNumAllocUnits = size_in_kb/KB_per_block */
+   cfg->unit[i].dNumAllocUnits = (u32)(*pt++ / KB_per_block);
+   cfg->unit[i].bDataReliability = *pt++;
+   cfg->unit[i].bLUWriteProtect = *pt++;
+   cfg->unit[i].bMemoryType = *pt++;
+   cfg->unit[i].bLogicalBlockSize = *pt++;
+   cfg->unit[i].bProvisioningType = *pt++;
+   cfg->unit[i].wContextCapabilities = *pt++;
+   }
+
+   cfg->lun_to_grow = *pt++;
+   commit = *pt++;
+   cfg->num_luns = *pt;
+   dev_dbg(hba->dev, "%s: lun_to_grow %u, commit %u num_luns %u\n",
+   __func__, cfg->lun_to_grow, commit, cfg->num_luns);
+   if (commit == 1) {
+   ret = ufshcd_do_config_device(hba);
+   if (!ret) {
+   hba->ufs_provision.is_enabled = 1;
+   dev_err(hba->dev,
+   

[PATCH RFC 2/3] scsi: ufs: Add ufs provisioning support

2018-05-21 Thread Sayali Lokhande
A new API, ufshcd_do_config_device(), is added to the driver
to support UFS provisioning at runtime. Sysfs support
is added to trigger provisioning.
Device configuration descriptors or parameters are
parsed from vendor-specific provisioning data and
passed via sysfs at runtime to provision the UFS device.

Signed-off-by: Sayali Lokhande 
---
 drivers/scsi/ufs/ufs.h|  28 +++
 drivers/scsi/ufs/ufshcd.c | 200 ++
 drivers/scsi/ufs/ufshcd.h |   1 +
 3 files changed, 229 insertions(+)

diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h
index e15deb0..1f99904 100644
--- a/drivers/scsi/ufs/ufs.h
+++ b/drivers/scsi/ufs/ufs.h
@@ -333,6 +333,7 @@ enum {
UFSHCD_AMP  = 3,
 };
 
+#define UFS_BLOCK_SIZE 4096
 #define POWER_DESC_MAX_SIZE0x62
 #define POWER_DESC_MAX_ACTV_ICC_LVLS   16
 
@@ -425,6 +426,33 @@ enum {
MASK_TM_SERVICE_RESP= 0xFF,
 };
 
+struct ufs_unit_desc {
+   u8 bLUEnable;  /* 1 for enabled LU */
+   u8 bBootLunID; /* 0 for using this LU for boot */
+   u8 bLUWriteProtect;/* 1 = power on WP, 2 = permanent WP */
+   u8 bMemoryType;/* 0 for enhanced memory type */
+   u32dNumAllocUnits; /* Number of alloc unit for this LU */
+   u8 bDataReliability;   /* 0 for reliable write support */
+   u8 bLogicalBlockSize;  /* See section 13.2.3 of UFS standard */
+   u8 bProvisioningType;  /* 0 for thin provisioning */
+   u16wContextCapabilities;   /* refer Unit Descriptor Description */
+};
+
+struct ufs_config_descr {
+   u8 bNumberLU;  /* Total number of active LUs */
+   u8 bBootEnable;/* enabling device for partial init */
+   u8 bDescrAccessEn; /* desc access during partial init */
+   u8 bInitPowerMode; /* Initial device power mode */
+   u8 bHighPriorityLUN;   /* LUN of the high priority LU */
+   u8 bSecureRemovalType; /* Erase config for data removal */
+   u8 bInitActiveICCLevel;/* ICC level after reset */
+   u16wPeriodicRTCUpdate; /* 0 to set a periodic RTC update rate */
+   u32 bConfigDescrLock;  /* 1 to lock Configuration Descriptor */
+   u32qVendorConfigCode;  /* Vendor specific configuration code */
+   struct ufs_unit_desc unit[8];
+   u8  lun_to_grow;
+};
+
 /* Task management service response */
 enum {
UPIU_TASK_MANAGEMENT_FUNC_COMPL = 0x00,
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 1ab882f..9ae64e2 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -237,6 +237,7 @@ static int ufshcd_config_pwr_mode(struct ufs_hba *hba,
struct ufs_pa_layer_attr *desired_pwr_mode);
 static int ufshcd_change_power_mode(struct ufs_hba *hba,
 struct ufs_pa_layer_attr *pwr_mode);
+static int ufshcd_do_config_device(struct ufs_hba *hba);
 static inline bool ufshcd_valid_tag(struct ufs_hba *hba, int tag)
 {
return tag >= 0 && tag < hba->nutrs;
@@ -3063,6 +3064,14 @@ static inline int ufshcd_read_power_desc(struct ufs_hba 
*hba,
return ufshcd_read_desc(hba, QUERY_DESC_IDN_POWER, 0, buf, size);
 }
 
+static inline int ufshcd_read_geometry_desc(struct ufs_hba *hba,
+u8 *buf,
+u32 size)
+{
+   return ufshcd_read_desc(hba, QUERY_DESC_IDN_GEOMETRY, 0, buf, size);
+}
+
+
 static int ufshcd_read_device_desc(struct ufs_hba *hba, u8 *buf, u32 size)
 {
return ufshcd_read_desc(hba, QUERY_DESC_IDN_DEVICE, 0, buf, size);
@@ -6352,6 +6361,197 @@ static int ufshcd_set_dev_ref_clk(struct ufs_hba *hba)
 }
 
 /**
+ * ufshcd_do_config_device - API function for UFS provisioning
+ * @hba: per-adapter instance
+ * Returns 0 for success, non-zero in case of failure.
+ */
+static int ufshcd_do_config_device(struct ufs_hba *hba)
+{
+   struct ufs_config_descr *cfg = &hba->cfgs;
+   int buff_len = QUERY_DESC_CONFIGURATION_DEF_SIZE;
+   u8 desc_buf[QUERY_DESC_CONFIGURATION_DEF_SIZE] = {0};
+   int i, ret = 0;
+   int lun_to_grow = -1;
+   u64 qTotalRawDeviceCapacity;
+   u16 wEnhanced1CapAdjFac, wEnhanced2CapAdjFac;
+   u32 dEnhanced1MaxNAllocU, dEnhanced2MaxNAllocU;
+   size_t alloc_units, units_to_create = 0;
+   size_t capacity_to_alloc_factor;
+   size_t enhanced1_units = 0, enhanced2_units = 0;
+   size_t conversion_ratio = 1;
+   u8 *pt;
+   u32 blocks_per_alloc_unit = 1024;
+   int geo_len = hba->desc_size.geom_desc;
+   u8 geo_buf[hba->desc_size.geom_desc];
+   unsigned int max_partitions = 9;
+
+   WARN_ON(!hba || !cfg);
+   ufshcd_set_dev_ref_clk(hba);
+
+   ret = ufshcd_read_geometry_desc(hba, geo_buf, geo_len);
+   if (ret) {
+ 

[PATCH RFC 1/3] scsi: ufs: set the device reference clock setting

2018-05-21 Thread Sayali Lokhande
From: Subhash Jadavani 

The UFS host supplies the reference clock to the UFS device, and the UFS
device specification allows the host to provide one of four frequencies
(19.2 MHz, 26 MHz, 38.4 MHz, 52 MHz) for the reference clock. The host
should set the device reference clock frequency setting in the device
based on the frequency it is supplying to the UFS device.
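
Since the quoted hunk below is cut off before the update half, here is a
sketch of the read-compare-write flow the text describes (error paths
trimmed; the WRITE_ATTR call is an assumption about how the patch
completes, not quoted from it):

	err = ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR,
			QUERY_ATTR_IDN_REF_CLK_FREQ, 0, 0, &ref_clk);
	if (err)
		return err;

	if (ref_clk != hba->dev_ref_clk_freq)
		err = ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_WRITE_ATTR,
				QUERY_ATTR_IDN_REF_CLK_FREQ, 0, 0,
				&hba->dev_ref_clk_freq);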

Signed-off-by: Subhash Jadavani 
[c...@codeaurora.org: Resolved trivial merge conflicts]
Signed-off-by: Can Guo 
Signed-off-by: Sayali Lokhande 
---
 .../devicetree/bindings/ufs/ufshcd-pltfrm.txt  |  8 +++
 drivers/scsi/ufs/ufs.h |  9 
 drivers/scsi/ufs/ufshcd-pltfrm.c   | 20 
 drivers/scsi/ufs/ufshcd.c  | 60 ++
 drivers/scsi/ufs/ufshcd.h  |  2 +
 5 files changed, 99 insertions(+)

diff --git a/Documentation/devicetree/bindings/ufs/ufshcd-pltfrm.txt 
b/Documentation/devicetree/bindings/ufs/ufshcd-pltfrm.txt
index c39dfef..ac94220 100644
--- a/Documentation/devicetree/bindings/ufs/ufshcd-pltfrm.txt
+++ b/Documentation/devicetree/bindings/ufs/ufshcd-pltfrm.txt
@@ -41,6 +41,12 @@ Optional properties:
 -lanes-per-direction   : number of lanes available per direction - either 1 or 2.
  Note that it is assumed the same number of lanes is used in both
  directions at once. If not specified, the default is 2 lanes per
  direction.
+- dev-ref-clk-freq : Specify the device reference clock frequency, must be one of the following:
+ 0: 19.2 MHz
+ 1: 26 MHz
+ 2: 38.4 MHz
+ 3: 52 MHz
+ Defaults to 26 MHz if not specified.
 
 Note: If above properties are not defined it can be assumed that the supply
 regulators or clocks are always on.
@@ -66,4 +72,6 @@ Example:
freq-table-hz = <1 2>, <0 0>, <0 0>;
phys = <>;
phy-names = "ufsphy";
+   dev-ref-clk-freq = <0>; /* reference clock freq: 19.2 MHz */
};
+
diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h
index 14e5bf7..e15deb0 100644
--- a/drivers/scsi/ufs/ufs.h
+++ b/drivers/scsi/ufs/ufs.h
@@ -378,6 +378,15 @@ enum query_opcode {
UPIU_QUERY_OPCODE_TOGGLE_FLAG   = 0x8,
 };
 
+/* bRefClkFreq attribute values */
+enum ref_clk_freq {
+   REF_CLK_FREQ_19_2_MHZ   = 0x0,
+   REF_CLK_FREQ_26_MHZ = 0x1,
+   REF_CLK_FREQ_38_4_MHZ   = 0x2,
+   REF_CLK_FREQ_52_MHZ = 0x3,
+   REF_CLK_FREQ_MAX= REF_CLK_FREQ_52_MHZ,
+};
+
 /* Query response result code */
 enum {
QUERY_RESULT_SUCCESS= 0x00,
diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.c b/drivers/scsi/ufs/ufshcd-pltfrm.c
index e82bde0..b70838b 100644
--- a/drivers/scsi/ufs/ufshcd-pltfrm.c
+++ b/drivers/scsi/ufs/ufshcd-pltfrm.c
@@ -221,6 +221,24 @@ static int ufshcd_parse_regulator_info(struct ufs_hba *hba)
return err;
 }
 
+static void ufshcd_parse_dev_ref_clk_freq(struct ufs_hba *hba)
+{
+   struct device *dev = hba->dev;
+   struct device_node *np = dev->of_node;
+   int ret;
+
+   if (!np)
+   return;
+
+   ret = of_property_read_u32(np, "dev-ref-clk-freq",
+  &hba->dev_ref_clk_freq);
+   if (ret ||
+   (hba->dev_ref_clk_freq < 0) ||
+   (hba->dev_ref_clk_freq > REF_CLK_FREQ_52_MHZ))
+   /* default setting */
+   hba->dev_ref_clk_freq = REF_CLK_FREQ_26_MHZ;
+}
+
 #ifdef CONFIG_PM
 /**
  * ufshcd_pltfrm_suspend - suspend power management function
@@ -343,6 +361,8 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
pm_runtime_set_active(&pdev->dev);
pm_runtime_enable(>dev);
 
+   ufshcd_parse_dev_ref_clk_freq(hba);
+
ufshcd_init_lanes_per_dir(hba);
 
err = ufshcd_init(hba, mmio_base, irq);
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index c5b1bf1..1ab882f 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -6297,6 +6297,61 @@ static void ufshcd_def_desc_sizes(struct ufs_hba *hba)
 }
 
 /**
+ * ufshcd_set_dev_ref_clk - set the device bRefClkFreq
+ * @hba: per-adapter instance
+ *
+ * Read the current value of the bRefClkFreq attribute from device and update 
it
+ * if host is supplying different reference clock frequency than one mentioned
+ * in bRefClkFreq attribute.
+ *
+ * Returns zero on success, non-zero error value on failure.
+ */
+static int ufshcd_set_dev_ref_clk(struct ufs_hba *hba)
+{
+   int err = 0;
+   int ref_clk = -1;
+   static const char * const ref_clk_freqs[] = {"19.2 MHz", "26 MHz",
+"38.4 MHz", "52 MHz"};
+
+   err = ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR,
+   QUERY_ATTR_IDN_REF_CLK_FREQ, 0, 0, &ref_clk);
+
+   if (err) {
+   

Re: [PATCH v3 4/4] rcu: Unlock non-start node only after accessing its gp_seq_needed

2018-05-21 Thread Paul E. McKenney
On Mon, May 21, 2018 at 05:28:23PM -0700, Paul E. McKenney wrote:
> On Mon, May 21, 2018 at 05:07:34PM -0700, Joel Fernandes wrote:
> > On Mon, May 21, 2018 at 04:25:38PM -0700, Paul E. McKenney wrote:
> > > On Sun, May 20, 2018 at 09:42:20PM -0700, Joel Fernandes wrote:
> > > > > We acquire gp_seq_needed locklessly. To be safe, let's do the unlocking
> > > > after the access.
> > > 
> > > Actually, no, we hold rnp_start's ->lock throughout.  And this CPU (or in
> > > the case of no-CBs CPUs, this task) is in charge of rdp->gp_seq_needed,
> > > so nothing else is accessing it.  Or at least that is the intent.  ;-)
> > 
> > I was talking about protecting the internal node's rnp->gp_seq_needed, not
> > the rnp_start's gp_seq_needed.
> 
> Ah, good point, I missed the "if" condition.  This can be argued to work,
> sort of, given that we still hold the leaf rcu_node structure's lock,
> so that there is a limit to how far grace periods can advance.
> 
> But the code would of course be much cleaner with your change.
> 
> > We are protecting them in the loop:
> > 
> > like this:
> > for(...)
> > if (rnp != rnp_start)
> > raw_spin_lock_rcu_node(rnp);
> > [...]
> > // access rnp->gp_seq and rnp->gp_seq_needed
> > [...]
> > if (rnp != rnp_start)
> > raw_spin_unlock_rcu_node(rnp);
> > 
> > But we don't need to do such protection in unlock_out ? I'm sorry if I'm
> > missing something, but I'm wondering if rnp->gp_seq_needed of an internal
> > node can be accessed locklessly, then why can't that be done also in the
> > funnel locking loop - after all we are holding the rnp_start's lock through
> > out right?
> 
> I was focused on the updates, and missed the rnp->gp_seq_req access in the
> "if" statement.  The current code does sort of work, but only assuming
> that the compiler doesn't tear the load, and so your change would help.
> Could you please resend with your other two updated patches?  It depends
> on one of the earlier patches, so does not apply cleanly as-is.  I could
> hand-apply it, but that sounds like a good way to make your updated
> series fail to apply.  ;-)
> 
> But could you also make the commit log explicitly call out the "if"
> condition as being the offending access?

Never mind, me being stupid.  I need to apply this change to the original
commit "rcu: Make rcu_nocb_wait_gp() check if GP already requested", which
I have done with this attribution:

[ paulmck: Move lock release past "if" as suggested by Joel Fernandes. ]

I have rebased my stack on top of the updated commit.

Thanx, Paul



Re: [PATCH net] tuntap: raise EPOLLOUT on device up

2018-05-21 Thread Jason Wang



On 2018-05-22 11:46, Michael S. Tsirkin wrote:

On Tue, May 22, 2018 at 11:22:11AM +0800, Jason Wang wrote:


On 2018-05-22 06:08, Michael S. Tsirkin wrote:

On Mon, May 21, 2018 at 11:47:42AM -0400, David Miller wrote:

From: Jason Wang 
Date: Fri, 18 May 2018 21:00:43 +0800


We return -EIO on device down but cannot raise EPOLLOUT after it comes
back up. This may confuse users like vhost, which expects tuntap to raise
EPOLLOUT to re-enable its TX routine after tuntap was down. This can
easily be reproduced by transmitting packets from a VM while bringing
the tap device down and up. Fix this by setting SOCKWQ_ASYNC_NOSPACE
on -EIO.

Cc: Hannes Frederic Sowa 
Cc: Eric Dumazet 
Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
Signed-off-by: Jason Wang 

I'm not so sure what to do with this patch.

Like Michael says, this flag bit is only checked upon transmit, which
may or may not happen after this point.  It doesn't seem to be
guaranteed.

The flag is checked in tun_chr_poll() as well.


Jason, can't we detect a link up transition and respond accordingly?
What do you think?


I think we've already tried to do this, in tun_net_open() we call
write_space(). But the problem is the bit may not be set at that time.

A second thought is to set the bit in tun_chr_poll() instead of -EIO like:

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d45ac37..46a1573 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1423,6 +1423,13 @@ static void tun_net_init(struct net_device *dev)
     dev->max_mtu = MAX_MTU - dev->hard_header_len;
  }

+static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
+{
+   struct sock *sk = tfile->socket.sk;
+
+   return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
+}
+
  /* Character device part */

  /* Poll */
@@ -1445,10 +1452,9 @@ static __poll_t tun_chr_poll(struct file *file,
poll_table *wait)
     if (!ptr_ring_empty(&tfile->tx_ring))
     mask |= EPOLLIN | EPOLLRDNORM;

-   if (tun->dev->flags & IFF_UP &&
-   (sock_writeable(sk) ||
-    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
- sock_writeable(sk))))
+   if (tun_sock_writeable(tun, tfile) ||
+   (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+    tun_sock_writeable(tun, tfile)))
     mask |= EPOLLOUT | EPOLLWRNORM;

     if (tun->dev->reg_state != NETREG_REGISTERED)

Does this make more sense?

Thanks

I just understood the motivation for doing it on EIO.
Maybe there's a reason it makes sense here as well,
but it's far from obvious. I suggest you repost adding
an explanation in the comment. The original patch will
be fine with an explanation as well.



Ok, let me add explanation on both code and commit log.

Thanks



[PATCH v4 2/2] xen/PVH: Make GDT selectors PVH-specific

2018-05-21 Thread Boris Ostrovsky
We don't need to share the PVH GDT layout with other GDTs, especially
since we now have a PVH-specific entry (for the stack canary segment).

Define PVH's own selectors.

(As a side effect of this change we are also fixing an improper
reference to __KERNEL_CS.)
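
Why "* 8": an x86 segment selector is the GDT entry index shifted left
by 3, the low bits being the table indicator and RPL, both zero here
(illustration only, not part of the patch):

	#define SEL(idx)	((idx) << 3)	/* == (idx) * 8; TI=0 (GDT), RPL=0 */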

Signed-off-by: Boris Ostrovsky 
---
 arch/x86/xen/xen-pvh.S | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 0169374..e7a5bf4 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -54,7 +54,11 @@
 * charge of setting up its own stack, GDT and IDT.
  */
 
-#define PVH_GDT_ENTRY_CANARY   4
+#define PVH_GDT_ENTRY_CS   1
+#define PVH_GDT_ENTRY_DS   2
+#define PVH_GDT_ENTRY_CANARY   3
+#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
+#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
 #define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
 
 ENTRY(pvh_start_xen)
@@ -62,7 +66,7 @@ ENTRY(pvh_start_xen)
 
lgdt (_pa(gdt))
 
-   mov $(__BOOT_DS),%eax
+   mov $PVH_DS_SEL,%eax
mov %eax,%ds
mov %eax,%es
mov %eax,%ss
@@ -96,7 +100,7 @@ ENTRY(pvh_start_xen)
mov %eax, %cr0
 
/* Jump to 64-bit mode. */
-   ljmp $__KERNEL_CS, $_pa(1f)
+   ljmp $PVH_CS_SEL, $_pa(1f)
 
/* 64-bit entry point. */
.code64
@@ -136,13 +140,13 @@ ENTRY(pvh_start_xen)
or $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
 
-   ljmp $__BOOT_CS, $1f
+   ljmp $PVH_CS_SEL, $1f
 1:
call xen_prepare_pvh
mov $_pa(pvh_bootparams), %esi
 
/* startup_32 doesn't expect paging and PAE to be on. */
-   ljmp $__BOOT_CS, $_pa(2f)
+   ljmp $PVH_CS_SEL, $_pa(2f)
 2:
mov %cr0, %eax
and $~X86_CR0_PG, %eax
@@ -151,7 +155,7 @@ ENTRY(pvh_start_xen)
and $~X86_CR4_PAE, %eax
mov %eax, %cr4
 
-   ljmp $__BOOT_CS, $_pa(startup_32)
+   ljmp $PVH_CS_SEL, $_pa(startup_32)
 #endif
 END(pvh_start_xen)
 
@@ -163,13 +167,12 @@ gdt:
.word 0
 gdt_start:
.quad 0x0000000000000000/* NULL descriptor */
-   .quad 0x0000000000000000/* reserved */
 #ifdef CONFIG_X86_64
-   .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */
+   .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */
 #else
-   .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */
+   .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
 #endif
-   .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */
+   .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
.quad GDT_ENTRY(0x4090, 0, 0x18)/* PVH_CANARY_SEL */
 gdt_end:
 
-- 
2.9.3



Re: KASAN: use-after-free Read in vhost_chr_write_iter

2018-05-21 Thread Michael S. Tsirkin
On Tue, May 22, 2018 at 11:50:29AM +0800, Jason Wang wrote:
> 
> 
> On 2018-05-21 22:42, Michael S. Tsirkin wrote:
> > On Mon, May 21, 2018 at 10:38:10AM +0800, Jason Wang wrote:
> > > On 2018-05-18 17:24, Jason Wang wrote:
> > > > On 2018-05-17 21:45, DaeRyong Jeong wrote:
> > > > > We report the crash: KASAN: use-after-free Read in 
> > > > > vhost_chr_write_iter
> > > > > 
> > > > > This crash has been found in v4.17-rc1 using RaceFuzzer (a modified
> > > > > version of Syzkaller), which we describe more at the end of this
> > > > > report. Our analysis shows that the race occurs when invoking two
> > > > > syscalls concurrently, write$vnet and ioctl$VHOST_RESET_OWNER.
> > > > > 
> > > > > 
> > > > > Analysis:
> > > > > We think the concurrent execution of vhost_process_iotlb_msg() and
> > > > > vhost_dev_cleanup() causes the crash.
> > > > > Both functions can run concurrently (please see the call sequence 
> > > > > below),
> > > > > and possibly, there is a race on dev->iotlb.
> > > > > If the switch occurs right after vhost_dev_cleanup() frees
> > > > > dev->iotlb, vhost_process_iotlb_msg() still sees the non-null value
> > > > > and it
> > > > > keeps executing without returning -EFAULT. Consequently, a use-after-free
> > > > > occurs.
> > > > > 
> > > > > 
> > > > > Thread interleaving:
> > > > > CPU0 (vhost_process_iotlb_msg)    CPU1 (vhost_dev_cleanup)
> > > > > (In the case of both VHOST_IOTLB_UPDATE and
> > > > > VHOST_IOTLB_INVALIDATE)
> > > > > =    =
> > > > >      vhost_umem_clean(dev->iotlb);
> > > > > if (!dev->iotlb) {
> > > > >      ret = -EFAULT;
> > > > >      break;
> > > > > }
> > > > >      dev->iotlb = NULL;
> > > > > 
> > > > > 
> > > > > Call Sequence:
> > > > > CPU0
> > > > > =
> > > > > vhost_net_chr_write_iter
> > > > >  vhost_chr_write_iter
> > > > >      vhost_process_iotlb_msg
> > > > > 
> > > > > CPU1
> > > > > =
> > > > > vhost_net_ioctl
> > > > >  vhost_net_reset_owner
> > > > >      vhost_dev_reset_owner
> > > > >      vhost_dev_cleanup
> > > > Thanks a lot for the analysis.
> > > > 
> > > > This could be addressed by simply protect it with dev mutex.
> > > > 
> > > > Will post a patch.
> > > > 
> > > Could you please help to test the attached patch? I've done some smoking
> > > test.
> > > 
> > > Thanks
> > > >From 88328386f3f652e684ee33dc4cf63dcaed871aea Mon Sep 17 00:00:00 2001
> > > From: Jason Wang
> > > Date: Fri, 18 May 2018 17:33:27 +0800
> > > Subject: [PATCH] vhost: synchronize IOTLB message with dev cleanup
> > > 
> > > DaeRyong Jeong reports a race between vhost_dev_cleanup() and
> > > vhost_process_iotlb_msg():
> > > 
> > > Thread interleaving:
> > > CPU0 (vhost_process_iotlb_msg)CPU1 (vhost_dev_cleanup)
> > > (In the case of both VHOST_IOTLB_UPDATE and
> > > VHOST_IOTLB_INVALIDATE)
> > > = =
> > >   vhost_umem_clean(dev->iotlb);
> > > if (!dev->iotlb) {
> > >   ret = -EFAULT;
> > >   break;
> > > }
> > >   dev->iotlb = NULL;
> > > 
> > > The reason is we don't synchronize between them, fixing by protecting
> > > vhost_process_iotlb_msg() with dev mutex.
> > > 
> > > Reported-by: DaeRyong Jeong
> > > Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
> > > Reported-by: DaeRyong Jeong
> > Long term we might want to move iotlb into vqs
> > so that messages can be processed in parallel.
> > Not sure how to do it yet.
> > 
> 
> Then we probably need to extend the IOTLB msg to have a queue idx. But I think
> it would probably only help if we split tx/rx into separate processes.
> 
> Thanks

3 mutex locks on each access isn't pretty even if done by
a single process, but yes - might be more important for scsi.

-- 
MST
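
A sketch of the fix described in the quoted patch, taking dev->mutex
around the IOTLB message handling so it cannot race with
vhost_dev_cleanup() clearing dev->iotlb (simplified, not the verbatim
attached patch):

	mutex_lock(&dev->mutex);	/* serialize with device cleanup */
	if (!dev->iotlb) {		/* cleanup already ran */
		ret = -EFAULT;
		goto out;
	}
	/* ... handle VHOST_IOTLB_UPDATE / VHOST_IOTLB_INVALIDATE ... */
out:
	mutex_unlock(&dev->mutex);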


[PATCH v4 2/2] xen/PVH: Make GDT selectors PVH-specific

2018-05-21 Thread Boris Ostrovsky
We don't need to share PVH GDT layout with other GDTs, especially
since we now have a PVH-speciific entry (for stack canary segment).

Define PVH's own selectors.

(As a side effect of this change we are also fixing improper
reference to __KERNEL_CS)

Signed-off-by: Boris Ostrovsky 
---
 arch/x86/xen/xen-pvh.S | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 0169374..e7a5bf4 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -54,7 +54,11 @@
  * charge of setting up it's own stack, GDT and IDT.
  */
 
-#define PVH_GDT_ENTRY_CANARY   4
+#define PVH_GDT_ENTRY_CS   1
+#define PVH_GDT_ENTRY_DS   2
+#define PVH_GDT_ENTRY_CANARY   3
+#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
+#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
 #define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
 
 ENTRY(pvh_start_xen)
@@ -62,7 +66,7 @@ ENTRY(pvh_start_xen)
 
lgdt (_pa(gdt))
 
-   mov $(__BOOT_DS),%eax
+   mov $PVH_DS_SEL,%eax
mov %eax,%ds
mov %eax,%es
mov %eax,%ss
@@ -96,7 +100,7 @@ ENTRY(pvh_start_xen)
mov %eax, %cr0
 
/* Jump to 64-bit mode. */
-   ljmp $__KERNEL_CS, $_pa(1f)
+   ljmp $PVH_CS_SEL, $_pa(1f)
 
/* 64-bit entry point. */
.code64
@@ -136,13 +140,13 @@ ENTRY(pvh_start_xen)
or $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
 
-   ljmp $__BOOT_CS, $1f
+   ljmp $PVH_CS_SEL, $1f
 1:
call xen_prepare_pvh
mov $_pa(pvh_bootparams), %esi
 
/* startup_32 doesn't expect paging and PAE to be on. */
-   ljmp $__BOOT_CS, $_pa(2f)
+   ljmp $PVH_CS_SEL, $_pa(2f)
 2:
mov %cr0, %eax
and $~X86_CR0_PG, %eax
@@ -151,7 +155,7 @@ ENTRY(pvh_start_xen)
and $~X86_CR4_PAE, %eax
mov %eax, %cr4
 
-   ljmp $__BOOT_CS, $_pa(startup_32)
+   ljmp $PVH_CS_SEL, $_pa(startup_32)
 #endif
 END(pvh_start_xen)
 
@@ -163,13 +167,12 @@ gdt:
.word 0
 gdt_start:
.quad 0x/* NULL descriptor */
-   .quad 0x/* reserved */
 #ifdef CONFIG_X86_64
-   .quad GDT_ENTRY(0xa09a, 0, 0xf) /* __KERNEL_CS */
+   .quad GDT_ENTRY(0xa09a, 0, 0xf) /* PVH_CS_SEL */
 #else
-   .quad GDT_ENTRY(0xc09a, 0, 0xf) /* __KERNEL_CS */
+   .quad GDT_ENTRY(0xc09a, 0, 0xf) /* PVH_CS_SEL */
 #endif
-   .quad GDT_ENTRY(0xc092, 0, 0xf) /* __KERNEL_DS */
+   .quad GDT_ENTRY(0xc092, 0, 0xf) /* PVH_DS_SEL */
.quad GDT_ENTRY(0x4090, 0, 0x18)/* PVH_CANARY_SEL */
 gdt_end:
 
-- 
2.9.3



Re: KASAN: use-after-free Read in vhost_chr_write_iter

2018-05-21 Thread Michael S. Tsirkin
On Tue, May 22, 2018 at 11:50:29AM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月21日 22:42, Michael S. Tsirkin wrote:
> > On Mon, May 21, 2018 at 10:38:10AM +0800, Jason Wang wrote:
> > > On 2018年05月18日 17:24, Jason Wang wrote:
> > > > On 2018年05月17日 21:45, DaeRyong Jeong wrote:
> > > > > We report the crash: KASAN: use-after-free Read in 
> > > > > vhost_chr_write_iter
> > > > > 
> > > > > This crash has been found in v4.17-rc1 using RaceFuzzer (a modified
> > > > > version of Syzkaller), which we describe more at the end of this
> > > > > report. Our analysis shows that the race occurs when invoking two
> > > > > syscalls concurrently, write$vnet and ioctl$VHOST_RESET_OWNER.
> > > > > 
> > > > > 
> > > > > Analysis:
> > > > > We think the concurrent execution of vhost_process_iotlb_msg() and
> > > > > vhost_dev_cleanup() causes the crash.
> > > > > Both of functions can run concurrently (please see call sequence 
> > > > > below),
> > > > > and possibly, there is a race on dev->iotlb.
> > > > > If the switch occurs right after vhost_dev_cleanup() frees
> > > > > dev->iotlb, vhost_process_iotlb_msg() still sees the non-null value
> > > > > and it
> > > > > keep executing without returning -EFAULT. Consequently, use-after-free
> > > > > occures
> > > > > 
> > > > > 
> > > > > Thread interleaving:
> > > > > CPU0 (vhost_process_iotlb_msg)    CPU1 (vhost_dev_cleanup)
> > > > > (In the case of both VHOST_IOTLB_UPDATE and
> > > > > VHOST_IOTLB_INVALIDATE)
> > > > > =    =
> > > > >      vhost_umem_clean(dev->iotlb);
> > > > > if (!dev->iotlb) {
> > > > >      ret = -EFAULT;
> > > > >      break;
> > > > > }
> > > > >      dev->iotlb = NULL;
> > > > > 
> > > > > 
> > > > > Call Sequence:
> > > > > CPU0
> > > > > =
> > > > > vhost_net_chr_write_iter
> > > > >  vhost_chr_write_iter
> > > > >      vhost_process_iotlb_msg
> > > > > 
> > > > > CPU1
> > > > > =
> > > > > vhost_net_ioctl
> > > > >  vhost_net_reset_owner
> > > > >      vhost_dev_reset_owner
> > > > >      vhost_dev_cleanup
> > > > Thanks a lot for the analysis.
> > > > 
> > > > This could be addressed by simply protect it with dev mutex.
> > > > 
> > > > Will post a patch.
> > > > 
> > > Could you please help to test the attached patch? I've done some smoking
> > > test.
> > > 
> > > Thanks
> > > >From 88328386f3f652e684ee33dc4cf63dcaed871aea Mon Sep 17 00:00:00 2001
> > > From: Jason Wang
> > > Date: Fri, 18 May 2018 17:33:27 +0800
> > > Subject: [PATCH] vhost: synchronize IOTLB message with dev cleanup
> > > 
> > > DaeRyong Jeong reports a race between vhost_dev_cleanup() and
> > > vhost_process_iotlb_msg():
> > > 
> > > Thread interleaving:
> > > CPU0 (vhost_process_iotlb_msg)CPU1 (vhost_dev_cleanup)
> > > (In the case of both VHOST_IOTLB_UPDATE and
> > > VHOST_IOTLB_INVALIDATE)
> > > = =
> > >   vhost_umem_clean(dev->iotlb);
> > > if (!dev->iotlb) {
> > >   ret = -EFAULT;
> > >   break;
> > > }
> > >   dev->iotlb = NULL;
> > > 
> > > The reason is we don't synchronize between them, fixing by protecting
> > > vhost_process_iotlb_msg() with dev mutex.
> > > 
> > > Reported-by: DaeRyong Jeong
> > > Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
> > > Reported-by: DaeRyong Jeong
> > Long terms we might want to move iotlb into vqs
> > so that messages can be processed in parallel.
> > Not sure how to do it yet.
> > 
> 
> Then we probably need to extend the IOTLB msg to have a queue idx. But I think
> it would probably only help if we split tx/rx into separate processes.
> 
> Thanks

3 mutex locks on each access isn't pretty even if done by
a single process, but yes - might be more important for scsi.

-- 
MST
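
A minimal sketch of the fix being discussed, serializing
vhost_process_iotlb_msg() against vhost_dev_cleanup() with the device
mutex (process_iotlb_msg() is a hypothetical stand-in for the real
handler; the vhost_dev fields are those from drivers/vhost/vhost.h):

static int process_iotlb_msg(struct vhost_dev *dev,
			     struct vhost_iotlb_msg *msg)
{
	int ret = 0;

	mutex_lock(&dev->mutex);
	if (!dev->iotlb) {	/* cleanup may have freed and cleared it */
		ret = -EFAULT;
		goto out;
	}
	/* ... handle VHOST_IOTLB_UPDATE / VHOST_IOTLB_INVALIDATE ... */
out:
	mutex_unlock(&dev->mutex);
	return ret;
}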


[PATCH v4 0/2] PVH GDT fixes

2018-05-21 Thread Boris Ostrovsky
Fix stack canary handling (in the first patch) and re-index the PVH GDT to
make it explicit that the GDT is PVH-specific.

v4:
* Load %gs after base address is calculated
* Increase stack canary segment size to 48 bytes for long mode.

Boris Ostrovsky (2):
  xen/PVH: Set up GS segment for stack canary
  xen/PVH: Make GDT selectors PVH-specific

 arch/x86/xen/xen-pvh.S | 47 +--
 1 file changed, 37 insertions(+), 10 deletions(-)

-- 
2.9.3



[PATCH v4 1/2] xen/PVH: Set up GS segment for stack canary

2018-05-21 Thread Boris Ostrovsky
We are making calls to C code (e.g. xen_prepare_pvh()) which may use the
stack canary (stored in the GS segment).

Signed-off-by: Boris Ostrovsky 
---
 arch/x86/xen/xen-pvh.S | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index e1a5fbe..0169374 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -54,6 +54,9 @@
  * charge of setting up it's own stack, GDT and IDT.
  */
 
+#define PVH_GDT_ENTRY_CANARY   4
+#define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
+
 ENTRY(pvh_start_xen)
cld
 
@@ -98,6 +101,12 @@ ENTRY(pvh_start_xen)
/* 64-bit entry point. */
.code64
 1:
+   /* Set base address in stack canary descriptor. */
+   mov $MSR_GS_BASE,%ecx
+   mov $canary, %rax
+   cdq
+   wrmsr
+
call xen_prepare_pvh
 
/* startup_64 expects boot_params in %rsi. */
@@ -107,6 +116,17 @@ ENTRY(pvh_start_xen)
 
 #else /* CONFIG_X86_64 */
 
+   /* Set base address in stack canary descriptor. */
+   movl $_pa(gdt_start),%eax
+   movl $_pa(canary),%ecx
+   movw %cx, (PVH_GDT_ENTRY_CANARY * 8) + 2(%eax)
+   shrl $16, %ecx
+   movb %cl, (PVH_GDT_ENTRY_CANARY * 8) + 4(%eax)
+   movb %ch, (PVH_GDT_ENTRY_CANARY * 8) + 7(%eax)
+
+   mov $PVH_CANARY_SEL,%eax
+   mov %eax,%gs
+
call mk_early_pgtbl_32
 
mov $_pa(initial_page_table), %eax
@@ -150,9 +170,13 @@ gdt_start:
.quad GDT_ENTRY(0xc09a, 0, 0xf) /* __KERNEL_CS */
 #endif
.quad GDT_ENTRY(0xc092, 0, 0xf) /* __KERNEL_DS */
+   .quad GDT_ENTRY(0x4090, 0, 0x18)/* PVH_CANARY_SEL */
 gdt_end:
 
-   .balign 4
+   .balign 16
+canary:
+   .fill 48, 1, 0
+
 early_stack:
.fill 256, 1, 0
 early_stack_end:
-- 
2.9.3
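
For reference, the 32-bit hunk above patches the canary's base address
into its GDT descriptor by hand. An illustrative C equivalent (not
kernel code; set_desc_base() is a hypothetical helper) showing which
descriptor bytes hold which base bits:

#include <stdint.h>

/* In an x86 segment descriptor, base bits 0-15 live at byte offsets
 * 2-3, bits 16-23 at offset 4, and bits 24-31 at offset 7. */
static void set_desc_base(uint8_t *desc, uint32_t base)
{
	desc[2] = base & 0xff;		/* base 7:0, written with movw */
	desc[3] = (base >> 8) & 0xff;	/* base 15:8 */
	desc[4] = (base >> 16) & 0xff;	/* base 23:16 */
	desc[7] = (base >> 24) & 0xff;	/* base 31:24 */
}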



Re: KASAN: use-after-free Read in vhost_chr_write_iter

2018-05-21 Thread Jason Wang



On 2018年05月21日 22:42, Michael S. Tsirkin wrote:

On Mon, May 21, 2018 at 10:38:10AM +0800, Jason Wang wrote:

On 2018年05月18日 17:24, Jason Wang wrote:

On 2018年05月17日 21:45, DaeRyong Jeong wrote:

We report the crash: KASAN: use-after-free Read in vhost_chr_write_iter

This crash has been found in v4.17-rc1 using RaceFuzzer (a modified
version of Syzkaller), which we describe more at the end of this
report. Our analysis shows that the race occurs when invoking two
syscalls concurrently, write$vnet and ioctl$VHOST_RESET_OWNER.


Analysis:
We think the concurrent execution of vhost_process_iotlb_msg() and
vhost_dev_cleanup() causes the crash.
Both functions can run concurrently (please see the call sequence below),
and possibly, there is a race on dev->iotlb.
If the switch occurs right after vhost_dev_cleanup() frees
dev->iotlb, vhost_process_iotlb_msg() still sees the non-null value and
keeps executing without returning -EFAULT. Consequently, a use-after-free
occurs.


Thread interleaving:
CPU0 (vhost_process_iotlb_msg)    CPU1 (vhost_dev_cleanup)
(In the case of both VHOST_IOTLB_UPDATE and
VHOST_IOTLB_INVALIDATE)
=    =
     vhost_umem_clean(dev->iotlb);
if (!dev->iotlb) {
     ret = -EFAULT;
     break;
}
     dev->iotlb = NULL;


Call Sequence:
CPU0
=
vhost_net_chr_write_iter
 vhost_chr_write_iter
     vhost_process_iotlb_msg

CPU1
=
vhost_net_ioctl
 vhost_net_reset_owner
     vhost_dev_reset_owner
     vhost_dev_cleanup

Thanks a lot for the analysis.

This could be addressed by simply protect it with dev mutex.

Will post a patch.


Could you please help to test the attached patch? I've done some smoke
testing.

Thanks
>From 88328386f3f652e684ee33dc4cf63dcaed871aea Mon Sep 17 00:00:00 2001
From: Jason Wang
Date: Fri, 18 May 2018 17:33:27 +0800
Subject: [PATCH] vhost: synchronize IOTLB message with dev cleanup

DaeRyong Jeong reports a race between vhost_dev_cleanup() and
vhost_process_iotlb_msg():

Thread interleaving:
CPU0 (vhost_process_iotlb_msg)  CPU1 (vhost_dev_cleanup)
(In the case of both VHOST_IOTLB_UPDATE and
VHOST_IOTLB_INVALIDATE)
=   =
vhost_umem_clean(dev->iotlb);
if (!dev->iotlb) {
ret = -EFAULT;
break;
}
dev->iotlb = NULL;

The reason is we don't synchronize between them, fixing by protecting
vhost_process_iotlb_msg() with dev mutex.

Reported-by: DaeRyong Jeong
Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Reported-by: DaeRyong Jeong

Long terms we might want to move iotlb into vqs
so that messages can be processed in parallel.
Not sure how to do it yet.



Then we probably need to extend the IOTLB msg to have a queue idx. But I think
it would probably only help if we split tx/rx into separate processes.


Thanks




Re: [PATCH] [RFC] bcachefs: SIX locks (shared/intent/exclusive)

2018-05-21 Thread Kent Overstreet
On Mon, May 21, 2018 at 08:04:16PM -0700, Matthew Wilcox wrote:
> On Mon, May 21, 2018 at 10:19:51PM -0400, Kent Overstreet wrote:
> > New lock for bcachefs, like read/write locks but with a third state,
> > intent.
> > 
> > Intent locks conflict with each other, but not with read locks; taking a
> > write lock requires first holding an intent lock.
> 
> Can you put something in the description that these are sleeping locks
> (like mutexes), not spinning locks (like spinlocks)?  (Yeah, I know
> there's the opportunistic spin, but conceptually, they're sleeping locks).

Yup, I'll add that

> 
> Some other things I'd like documented:
> 
>  - Any number of readers can hold the lock
>  - Once one thread acquires the lock for intent, further intent acquisitions
>will block.  May new readers acquire the lock?

I think I should have that covered already - "Intent does not block read, but
does block other intent locks"

>  - You cannot acquire the lock for write directly, you must acquire it for
>intent first, then upgrade to write.
>  - Can you downgrade to read from intent, or downgrade from write back to
>intent?

You hold both write and intent, like so:

six_lock_intent(&foo->lock);
six_lock_write(&foo->lock);
six_unlock_write(&foo->lock);
six_unlock_intent(&foo->lock);


>  - Once you are trying to upgrade from intent to write, are new read
>acquisitions blocked? (can readers starve writers?)

Readers can starve writers in the current implementation, but that's something
that should probably be fixed...

>  - When you drop the lock as a writer, do we prefer reader acquisitions
>over intent acquisitions?  That is, if we have a queue of RRIRIRIR,
>and we drop the lock, does the queue look like II or IRIR?

Separate queues per lock type, so dropping a write lock will wake up everyone
trying to take a read lock, and dropping an intent lock wakes up everyone trying
to take an intent lock.

---

Here's the new documentation I just wrote:

/*
 * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
 * semaphores, except with a third intermediate state, intent. Basic operations
 * are:
 *
 * six_lock_read(&foo->lock);
 * six_unlock_read(&foo->lock);
 *
 * six_lock_intent(&foo->lock);
 * six_unlock_intent(&foo->lock);
 *
 * six_lock_write(&foo->lock);
 * six_unlock_write(&foo->lock);
 *
 * Intent locks block other intent locks, but do not block read locks, and you
 * must have an intent lock held before taking a write lock, like so:
 *
 * six_lock_intent(&foo->lock);
 * six_lock_write(&foo->lock);
 * six_unlock_write(&foo->lock);
 * six_unlock_intent(&foo->lock);
 *
 * Other operations:
 *
 *   six_trylock_read()
 *   six_trylock_intent()
 *   six_trylock_write()
 *
 *   six_lock_downgrade():  convert from intent to read
 *   six_lock_tryupgrade(): attempt to convert from read to intent
 *
 * Locks also embed a sequence number, which is incremented when the lock is
 * locked or unlocked for write. The current sequence number can be grabbed
 * while a lock is held from lock->state.seq; then, if you drop the lock you can
 * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock
 * iff it hasn't been locked for write in the meantime.
 *
 * There are also operations that take the lock type as a parameter, where the
 * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
 *
 *   six_lock_type(lock, type)
 *   six_unlock_type(lock, type)
 *   six_relock(lock, type, seq)
 *   six_trylock_type(lock, type)
 *   six_trylock_convert(lock, from, to)
 *
 * A lock may be held multiple times by the same thread (for read or intent,
 * not write) - up to SIX_LOCK_MAX_RECURSE. However, the six locks code does
 * _not_ implement the actual recursive checks itself though - rather, if your
 * code (e.g. btree iterator code) knows that the current thread already has a
 * lock held, and for the correct type, six_lock_increment() may be used to
 * bump up the counter for that type - the only effect is that one more call to
 * unlock will be required before the lock is unlocked.
 *
 */
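
A usage sketch based on the operations documented above (struct foo and
update_foo() are hypothetical; six_relock_intent() follows the
six_relock_* naming from the comment):

struct foo {
	struct six_lock	lock;
	int		value;
};

static void update_foo(struct foo *f, int v)
{
	u32 seq;

	six_lock_intent(&f->lock);	/* readers may still proceed */
	six_lock_write(&f->lock);	/* now fully exclusive */
	f->value = v;
	six_unlock_write(&f->lock);

	seq = f->lock.state.seq;	/* remember the sequence number */
	six_unlock_intent(&f->lock);

	/* Later: retake the lock iff nobody locked it for write since. */
	if (six_relock_intent(&f->lock, seq))
		six_unlock_intent(&f->lock);
}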


Re: [PATCH net] tuntap: raise EPOLLOUT on device up

2018-05-21 Thread Michael S. Tsirkin
On Tue, May 22, 2018 at 11:22:11AM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月22日 06:08, Michael S. Tsirkin wrote:
> > On Mon, May 21, 2018 at 11:47:42AM -0400, David Miller wrote:
> > > From: Jason Wang 
> > > Date: Fri, 18 May 2018 21:00:43 +0800
> > > 
> > > > > We return -EIO on device down but can not raise EPOLLOUT after it comes
> > > > > back up. This may confuse users like vhost, which expects tuntap to raise
> > > > > EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> > > > > be easily reproduced by transmitting packets from a VM while downing and
> > > > > upping the tap device. Fix this by setting SOCKWQ_ASYNC_NOSPACE on -EIO.
> > > > 
> > > > Cc: Hannes Frederic Sowa 
> > > > Cc: Eric Dumazet 
> > > > Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> > > > Signed-off-by: Jason Wang 
> > > I'm not so sure what to do with this patch.
> > > 
> > > Like Michael says, this flag bit is only checked upon transmit, which
> > > may or may not happen after this point.  It doesn't seem to be
> > > guaranteed.
> 
> The flag is checked in tun_chr_poll() as well.
> 
> > Jason, can't we detect a link up transition and respond accordingly?
> > What do you think?
> > 
> 
> I think we've already tried to do this, in tun_net_open() we call
> write_space(). But the problem is the bit may not be set at that time.
> 
> A second thought is to set the bit in tun_chr_poll() instead of -EIO like:
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index d45ac37..46a1573 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1423,6 +1423,13 @@ static void tun_net_init(struct net_device *dev)
>     dev->max_mtu = MAX_MTU - dev->hard_header_len;
>  }
> 
> +static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file
> *tfile)
> +{
> +   struct sock *sk = tfile->socket.sk;
> +
> +   return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
> +}
> +
>  /* Character device part */
> 
>  /* Poll */
> @@ -1445,10 +1452,9 @@ static __poll_t tun_chr_poll(struct file *file,
> poll_table *wait)
>     if (!ptr_ring_empty(&tfile->tx_ring))
>     mask |= EPOLLIN | EPOLLRDNORM;
> 
> -   if (tun->dev->flags & IFF_UP &&
> -   (sock_writeable(sk) ||
> -    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
> - sock_writeable(sk))))
> +   if (tun_sock_writeable(tun, tfile) ||
> +   (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
> +    tun_sock_writeable(tun, tfile)))
>     mask |= EPOLLOUT | EPOLLWRNORM;
> 
>     if (tun->dev->reg_state != NETREG_REGISTERED)
> 
> Does this make more sense?
> 
> Thanks

I just understood the motivation for doing it on EIO.
Maybe there's a reason it makes sense here as well,
but it's far from obvious. I suggest you repost adding
an explanation in the comment. The original patch will
be fine with an explanation as well.
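
For clarity, the poll-side pattern in the proposed diff works like this
(a sketch, not the actual driver code; tun_sock_writeable() is the
helper from the diff above):

static __poll_t sketch_poll_writable(struct tun_struct *tun,
				     struct tun_file *tfile)
{
	struct sock *sk = tfile->socket.sk;

	/* Fast path: already writable, no need to arm a wakeup. */
	if (tun_sock_writeable(tun, tfile))
		return EPOLLOUT | EPOLLWRNORM;

	/* Arm the wakeup, then re-check to close the window where the
	 * device became writable between the first check and the bit
	 * being set.
	 */
	if (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
	    tun_sock_writeable(tun, tfile))
		return EPOLLOUT | EPOLLWRNORM;

	return 0;
}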



[PATCH net V2 2/4] virtio-net: correctly transmit XDP buff after linearizing

2018-05-21 Thread Jason Wang
We should not go for the error path after successfully transmitting an
XDP buffer after linearizing, since the error path may try to pop and
drop the next packet and increase the drop counters. Fix this by simply
dropping the refcnt of the original page and going down the xmit path.

Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous 
buffers")
Cc: John Fastabend 
Acked-by: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c15d240..6260d65 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -775,7 +775,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
}
*xdp_xmit = true;
if (unlikely(xdp_page != page))
-   goto err_xdp;
+   put_page(page);
rcu_read_unlock();
goto xdp_xmit;
case XDP_REDIRECT:
-- 
2.7.4



[PATCH] perf annotate: Show group event string for stdio

2018-05-21 Thread Jin Yao
When the group is enabled, for tui/stdio2 the first line of output
includes the group event string, while for stdio it shows only one
event.

For example,

perf record -e cycles,branches ./div
perf annotate --group --stdio

Percent |  Source code & Disassembly of div for cycles (44407 samples)
..

The first line doesn't include the event 'branches'.

With this patch, it will show the correct group event string.

perf annotate --group --stdio

Percent |  Source code & Disassembly of div for cycles, branches (44407 
samples)
..

Signed-off-by: Jin Yao 
---
 tools/perf/util/annotate.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 6612c7f..7189768 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1965,6 +1965,7 @@ int symbol__annotate_printf(struct symbol *sym, struct 
map *map,
u64 len;
int width = symbol_conf.show_total_period ? 12 : 8;
int graph_dotted_len;
+   char buf[512];
 
filename = strdup(dso->long_name);
if (!filename)
@@ -1977,8 +1978,11 @@ int symbol__annotate_printf(struct symbol *sym, struct 
map *map,
 
len = symbol__size(sym);
 
-   if (perf_evsel__is_group_event(evsel))
+   if (perf_evsel__is_group_event(evsel)) {
width *= evsel->nr_members;
+   perf_evsel__group_desc(evsel, buf, sizeof(buf));
+   evsel_name = buf;
+   }
 
graph_dotted_len = printf(" %-*.*s| Source code & Disassembly of %s 
for %s (%" PRIu64 " samples)\n",
  width, width, symbol_conf.show_total_period ? 
"Period" :
-- 
2.7.4



[PATCH net V2 4/4] virtio-net: fix leaking page for gso packet during mergeable XDP

2018-05-21 Thread Jason Wang
We need to drop the refcnt of xdp_page if we see a gso packet, otherwise
it will be leaked. Fix this by moving the gso packet check above the
linearizing logic. While at it, remove a useless comment as well.

Cc: John Fastabend 
Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous 
buffers")
Signed-off-by: Jason Wang 
---
 drivers/net/virtio_net.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 326e247..032e1ac 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -707,6 +707,13 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
void *data;
u32 act;
 
+   /* Transient failure which in theory could occur if
+* in-flight packets from before XDP was enabled reach
+* the receive path after XDP is loaded.
+*/
+   if (unlikely(hdr->hdr.gso_type))
+   goto err_xdp;
+
/* This happens when rx buffer size is underestimated
 * or headroom is not enough because of the buffer
 * was refilled before XDP is set. This should only
@@ -727,14 +734,6 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
xdp_page = page;
}
 
-   /* Transient failure which in theory could occur if
-* in-flight packets from before XDP was enabled reach
-* the receive path after XDP is loaded. In practice I
-* was not able to create this condition.
-*/
-   if (unlikely(hdr->hdr.gso_type))
-   goto err_xdp;
-
/* Allow consuming headroom but reserve enough space to push
 * the descriptor on if we get an XDP_TX return code.
 */
-- 
2.7.4
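
The pattern behind this fix, sketched generically (illustrative only,
not driver code): perform the checks that can bail out before acquiring
any resource, so the error path has nothing extra to release.

static int handle_frame(bool gso_unsupported)
{
	struct page *copy;

	if (gso_unsupported)		/* bail before taking references */
		return -EINVAL;

	copy = alloc_page(GFP_ATOMIC);	/* resources acquired after checks */
	if (!copy)
		return -ENOMEM;
	/* ... use copy ... */
	put_page(copy);
	return 0;
}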



[PATCH net V2 3/4] virtio-net: correctly check num_buf during err path

2018-05-21 Thread Jason Wang
If we successfully linearize the packet, num_buf will be set to zero,
which may confuse the error handling path, which assumes num_buf is at
least 1; this can lead the code to try to pop the descriptor of the next
buffer. Fix this by checking num_buf against 1 before decreasing.

Fixes: 4941d472bf95 ("virtio-net: do not reset during XDP set")
Signed-off-by: Jason Wang 
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6260d65..326e247 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -875,7 +875,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
rcu_read_unlock();
 err_skb:
put_page(page);
-   while (--num_buf) {
+   while (num_buf-- > 1) {
buf = virtqueue_get_buf(rq->vq, &len);
if (unlikely(!buf)) {
pr_debug("%s: rx error: %d buffers missing\n",
-- 
2.7.4
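
The difference between the two loop forms is easy to demonstrate in
isolation (standalone illustration, not driver code):

#include <stdio.h>

int main(void)
{
	int num_buf = 0;	/* state after a successful linearize */

	/* Old form: --0 yields -1, which is truthy, so the loop body
	 * would run and pop a descriptor of the *next* packet.
	 */
	if (--num_buf)
		printf("old form: body runs with num_buf == 0\n");

	num_buf = 0;
	/* Fixed form: 0 > 1 is false, so the body never runs for
	 * num_buf values of 0 or 1.
	 */
	while (num_buf-- > 1)
		printf("fixed form: never printed\n");

	return 0;
}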



Re: [PATCH net] tuntap: raise EPOLLOUT on device up

2018-05-21 Thread Michael S. Tsirkin
On Tue, May 22, 2018 at 11:22:11AM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月22日 06:08, Michael S. Tsirkin wrote:
> > On Mon, May 21, 2018 at 11:47:42AM -0400, David Miller wrote:
> > > From: Jason Wang 
> > > Date: Fri, 18 May 2018 21:00:43 +0800
> > > 
> > > > > We return -EIO on device down but can not raise EPOLLOUT after it comes
> > > > > back up. This may confuse users like vhost, which expects tuntap to raise
> > > > > EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> > > > > be easily reproduced by transmitting packets from a VM while downing and
> > > > > upping the tap device. Fix this by setting SOCKWQ_ASYNC_NOSPACE on -EIO.
> > > > 
> > > > Cc: Hannes Frederic Sowa 
> > > > Cc: Eric Dumazet 
> > > > Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> > > > Signed-off-by: Jason Wang 
> > > I'm not so sure what to do with this patch.
> > > 
> > > Like Michael says, this flag bit is only checked upon transmit, which
> > > may or may not happen after this point.  It doesn't seem to be
> > > guaranteed.
> 
> The flag is checked in tun_chr_poll() as well.
> 
> > Jason, can't we detect a link up transition and respond accordingly?
> > What do you think?
> > 
> 
> I think we've already tried to do this, in tun_net_open() we call
> write_space(). But the problem is the bit may not be set at that time.

Which bit? __dev_change_flags seems to set IFF_UP before calling
ndo_open. The issue  I think is that tun_sock_write_space
exits if SOCKWQ_ASYNC_NOSPACE is clear.

And now I think I understand what is going on:

When link is down, writes to the device might fail with -EIO.
Userspace needs an indication when the status is resolved.  As a fix,
tun_net_open attempts to wake up writers - but that is only effective if
SOCKWQ_ASYNC_NOSPACE has been set in the past.  As a quick hack, set
SOCKWQ_ASYNC_NOSPACE when write fails because of the link down status.
If no writes failed, userspace does not know that interface
was down so should not care that it's going up.


does this describe what this line of code does?
If yes feel free to include this info in a code comment and commit log.



> A second thought is to set the bit in tun_chr_poll() instead of -EIO like:
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index d45ac37..46a1573 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1423,6 +1423,13 @@ static void tun_net_init(struct net_device *dev)
>     dev->max_mtu = MAX_MTU - dev->hard_header_len;
>  }
> 
> +static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file
> *tfile)
> +{
> +   struct sock *sk = tfile->socket.sk;
> +
> +   return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
> +}
> +
>  /* Character device part */
> 
>  /* Poll */
> @@ -1445,10 +1452,9 @@ static __poll_t tun_chr_poll(struct file *file,
> poll_table *wait)
>     if (!ptr_ring_empty(&tfile->tx_ring))
>     mask |= EPOLLIN | EPOLLRDNORM;
> 
> -   if (tun->dev->flags & IFF_UP &&
> -   (sock_writeable(sk) ||
> -    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
> - sock_writeable(sk))))
> +   if (tun_sock_writeable(tun, tfile) ||
> +   (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
> +    tun_sock_writeable(tun, tfile)))
>     mask |= EPOLLOUT | EPOLLWRNORM;
> 
>     if (tun->dev->reg_state != NETREG_REGISTERED)
> 
> Does this make more sense?
> 
> Thanks
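
The wakeup side described above, in simplified form (a sketch modelled
on tun_sock_write_space(); fasync delivery omitted):

static void sketch_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk))
		return;

	/* If nobody armed SOCKWQ_ASYNC_NOSPACE there is no writer to
	 * wake; this is why the wakeup from tun_net_open() is a no-op
	 * unless a write already failed.
	 */
	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);
}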


[PATCH net V2 1/4] virtio-net: correctly redirect linearized packet

2018-05-21 Thread Jason Wang
After a linearized packet was redirected by XDP, we should not go down
the err path, which will try to pop buffers for the next packet and
increase the drop counter. Fix this by just dropping the page refcnt
for the original page.

Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
Reported-by: David Ahern 
Tested-by: David Ahern 
Acked-by: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 770422e..c15d240 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -787,7 +787,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
}
*xdp_xmit = true;
if (unlikely(xdp_page != page))
-   goto err_xdp;
+   put_page(page);
rcu_read_unlock();
goto xdp_xmit;
default:
-- 
2.7.4



[PATCH net V2 0/4] Fix several issues of virtio-net mergeable XDP

2018-05-21 Thread Jason Wang
Hi:

Please review these patches, which try to fix several issues with
virtio-net mergeable XDP.

Changes from V1:
- check against 1 before decreasing instead of resetting to 1
- typo fixes

Jason Wang (4):
  virtio-net: correctly redirect linearized packet
  virtio-net: correctly transmit XDP buff after linearizing
  virtio-net: correctly check num_buf during err path
  virtio-net: fix leaking page for gso packet during mergeable XDP

 drivers/net/virtio_net.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

-- 
2.7.4



Re: [PATCH 12/33] clk: bcm2835: use match_string() helper

2018-05-21 Thread Yisheng Xie
Hi Andy,

On 2018/5/22 5:50, Andy Shevchenko wrote:
> On Mon, May 21, 2018 at 2:57 PM, Yisheng Xie  wrote:
>> match_string() returns the index of an array for a matching string,
>> which can be used instead of an open-coded variant.
>>
>> Cc: Michael Turquette 
>> Cc: Stephen Boyd 
>> Cc: Eric Anholt 
>> Cc: Stefan Wahren 
>> Cc: linux-...@vger.kernel.org
>> Cc: linux-rpi-ker...@lists.infradead.org
>> Cc: linux-arm-ker...@lists.infradead.org
>> Signed-off-by: Yisheng Xie 
> 
>> -   size_t i, j;
>> -   int ret;
>> +   int i, ret;
> 
> I do not see any need to change type for i.

Right, I just wanted to reduce the lines of code; unsigned int is also OK
for i. Anyway, I can change it per your suggestion in the next version.

Thanks
Yisheng

> 
>> +   ret = match_string(cprman_parent_names,
>> +  ARRAY_SIZE(cprman_parent_names),
>> +  parents[i]);
>> +   if (ret >= 0)
>> +   parents[i] = cprman->real_parent_names[ret];
> 
> 
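
The transformation is easy to see side by side (illustrative example;
the names[] table is hypothetical, not from the patch):

static const char * const names[] = { "osc", "plla", "pllb" };

/* Open-coded variant: */
static int find_name_open_coded(const char *name)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(names); i++)
		if (!strcmp(names[i], name))
			return i;
	return -EINVAL;
}

/* Same result with the helper, which returns the index or -EINVAL: */
static int find_name(const char *name)
{
	return match_string(names, ARRAY_SIZE(names), name);
}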



Re: Revert "dmaengine: pl330: add DMA_PAUSE feature"

2018-05-21 Thread Vinod Koul
On 21-05-18, 20:56, Frank Mori Hess wrote:
> On Mon, May 21, 2018 at 5:16 AM, Vinod Koul  wrote:
> >>
> >> I understand the residue can't be read after terminate, that's why
> >> reading the residue is step 2 in pause/residue/terminate.  My question
> >> was whether the entire sequence pause/residue/terminate taken as a
> >> whole can or cannot lose data.  Saying that individual steps can or
> >> can't lose data is not enough, context is required.  The key point is
> >> whether pause flushes in-flight data to its destination or not.  If it
> >> does, and our residue is accurate, the terminate cannot cause data
> >> loss.  If pause doesn't flush, an additional step of flush_sync as
> >> Lars suggested is required.  So pause/flush_sync/residue/terminate
> >> would be the safe sequence that cannot lose data.
> >
> > I wouldn't use cannot, shouldn't would be better here as it depends on HW 
> > and
> > where all data has been buffered and if it can be flushed or not.
> >
> > Have you checked if pl330 supports such flushing?
> 
> It does not, at least in the context of pausing.  The dma-330 DMAEND
> instruction flushes in-flight data to its destination, and there are
> read/write barrier instructions also, but none of them can be injected
> on the fly into a running dma thread.  DMAKILL can be, but it discards
> in-flight data.  Currently, if an 8250 serial driver uses the pl330
> for rx dma, the result is possible data loss/corruption.  If there was
> a stronger pause capability, call it "cmd_sync_pause" which guaranteed
> flushing of in-flight data to its destination and accurate residue
> reading when paused, then the 8250 serial driver could check for
> "cmd_sync_pause" and reject dma drivers that do not have that
> capability.  pl330.c would not advertise cmd_sync_pause.  I don't know
> if other dmaengine hardware would be able to support cmd_sync_pause or
> not, I'm mostly just familiar with the pl330.  The ep93xx_dma.c driver
> for example has a m2p_hw_synchronize function which seems to do a
> flush.

Well, it looks like even adding support for sync_pause doesn't solve your
issue on the pl330. Do you want to move this to PIO mode then?

-- 
~Vinod
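
The pause/residue/terminate sequence from the discussion, sketched with
the existing dmaengine client API (the flush step is hypothetical; as
discussed, no sync-pause capability exists today):

#include <linux/dmaengine.h>

static u32 stop_and_get_residue(struct dma_chan *chan, dma_cookie_t cookie)
{
	struct dma_tx_state state;

	dmaengine_pause(chan);			/* step 1: pause */
	/* hypothetical step: flush in-flight data to the destination */
	dmaengine_tx_status(chan, cookie, &state);	/* step 2: residue */
	dmaengine_terminate_sync(chan);		/* step 3: terminate */

	return state.residue;
}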


Re: [PATCH net 4/4] virito-net: fix leaking page for gso packet during mergeable XDP

2018-05-21 Thread Jason Wang



On 2018年05月21日 23:01, Michael S. Tsirkin wrote:

On Mon, May 21, 2018 at 04:35:06PM +0800, Jason Wang wrote:

We need to drop the refcnt of xdp_page if we see a gso packet, otherwise
it will be leaked. Fix this by moving the gso packet check above the
linearizing logic.

Cc: John Fastabend 
Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous 
buffers")
Signed-off-by: Jason Wang 

typo in subject


Let me fix it in V2.


---
  drivers/net/virtio_net.c | 16 
  1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 165a922..f8db809 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -707,6 +707,14 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
void *data;
u32 act;
  
+		/* Transient failure which in theory could occur if

+* in-flight packets from before XDP was enabled reach
+* the receive path after XDP is loaded. In practice I
+* was not able to create this condition.

BTW we should probably drop the last sentence. It says in theory, should be 
enough.


Ok.

Thanks


+*/
+   if (unlikely(hdr->hdr.gso_type))
+   goto err_xdp;
+
/* This happens when rx buffer size is underestimated
 * or headroom is not enough because of the buffer
 * was refilled before XDP is set. This should only
@@ -728,14 +736,6 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
xdp_page = page;
}
  
-		/* Transient failure which in theory could occur if

-* in-flight packets from before XDP was enabled reach
-* the receive path after XDP is loaded. In practice I
-* was not able to create this condition.
-*/
-   if (unlikely(hdr->hdr.gso_type))
-   goto err_xdp;
-
/* Allow consuming headroom but reserve enough space to push
 * the descriptor on if we get an XDP_TX return code.
 */
--
2.7.4




Re: [PATCH net 3/4] virtio-net: reset num_buf to 1 after linearizing packet

2018-05-21 Thread Jason Wang



On 2018年05月21日 22:59, Michael S. Tsirkin wrote:

On Mon, May 21, 2018 at 04:35:05PM +0800, Jason Wang wrote:

If we successfully linearize the packet, num_buf is set to zero,
which is wrong since we now have only 1 buffer to be used, e.g. in
the error path of receive_mergeable(). A zero num_buf will lead the code
to try to pop the buffers of the next packet and drop them. Fix this by
setting num_buf to 1 if we successfully linearize the packet.

Fixes: 4941d472bf95 ("virtio-net: do not reset during XDP set")
Signed-off-by: Jason Wang 
---
  drivers/net/virtio_net.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6260d65..165a922 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -722,6 +722,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
  );
if (!xdp_page)
goto err_xdp;
+   num_buf = 1;

So this is tweaked here for the benefit of err_skb below.
That's confusing and we won't remember to change it
if we change the error handling.

How about fixing the error path?


-		while (--num_buf) {
+		while (num_buf-- > 1) {

Seems more robust to me.


Looks good, will do this in V2.

Thanks




offset = VIRTIO_XDP_HEADROOM;
} else {
xdp_page = page;
--
2.7.4
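
The robustness argument is easy to demonstrate: for any valid count the two
loop forms pop the same number of buffers, but when num_buf is already 0 the
pre-decrement form underflows to a truthy value and walks into the next
packet's buffers. A small standalone sketch (illustrative only, not the
driver code):

#include <stdio.h>

/* Buggy form: with num_buf == 0, --num_buf yields -1, which is truthy,
 * so the loop runs (capped here so the demo terminates).
 */
static int pops_with_predecrement(int num_buf)
{
	int pops = 0;

	while (--num_buf) {
		pops++;
		if (pops > 3)	/* cap the runaway loop for the demo */
			break;
	}
	return pops;
}

/* Robust form: with num_buf == 0 the condition 0 > 1 is false and the
 * loop body never runs.
 */
static int pops_with_postdecrement(int num_buf)
{
	int pops = 0;

	while (num_buf-- > 1)
		pops++;
	return pops;
}

int main(void)
{
	/* Equivalent for a valid count: both pop num_buf - 1 buffers. */
	printf("num_buf=3: %d vs %d pops\n",
	       pops_with_predecrement(3), pops_with_postdecrement(3));
	/* Divergent for num_buf == 0: 4 (capped runaway) vs 0. */
	printf("num_buf=0: %d vs %d pops\n",
	       pops_with_predecrement(0), pops_with_postdecrement(0));
	return 0;
}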




[PATCH] scripts/kconfig/confdata.c: Fix scripts/checkpatch.pl coding style issues.

2018-05-21 Thread user1
Hi.
I fixed the coding style issues in scripts/kconfig/confdata.c so that
checkpatch.pl at least no longer reports errors.

Signed-off-by: user1 

---
  scripts/kconfig/confdata.c | 110 -
  1 file changed, 60 insertions(+), 50 deletions(-)

diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index df26c7b0fe13..d978ed1588e6 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c
@@ -50,7 +50,7 @@ static void conf_default_message_callback(const char *fmt, va_list ap)
  printf("\n#\n");
  }

-static void (*conf_message_callback) (const char *fmt, va_list ap) =
+static void (*conf_message_callback)(const char *fmt, va_list ap) =
  conf_default_message_callback;
  void conf_set_message_callback(void (*fn) (const char *fmt, va_list ap))
  {
@@ -176,7 +176,7 @@ static int conf_set_sym_val(struct symbol *sym, int def, int def_flags, char *p)
  /* fall through */
  case S_INT:
  case S_HEX:
-done:
+done:
  if (sym_string_valid(sym, p)) {
  sym->def[def].val = xstrdup(p);
  sym->flags |= def_flags;
@@ -188,7 +188,7 @@ static int conf_set_sym_val(struct symbol *sym, int def, int def_flags, char *p)
  }
  break;
  default:
-;
+break;
  }
  return 0;
  }
@@ -198,6 +198,7 @@ static int add_byte(int c, char **lineptr, size_t slen, size_t *n)
  {
  char *nline;
  size_t new_size = slen + 1;
+
  if (new_size > *n) {
  new_size += LINE_GROWTH - 1;
  new_size *= 2;
@@ -343,7 +344,7 @@ int conf_read_simple(const char *name, int def)
  sym->flags |= def_flags;
  break;
  default:
-;
+break;
  }
  } else if (memcmp(line, CONFIG_, strlen(CONFIG_)) == 0) {
  p = strchr(line + strlen(CONFIG_), '=');
@@ -381,7 +382,9 @@ int conf_read_simple(const char *name, int def)
  }
  setsym:
  if (sym && sym_is_choice_value(sym)) {
-struct symbol *cs = prop_get_symbol(sym_get_choice_prop(sym));
+struct symbol *cs =
+prop_get_symbol(sym_get_choice_prop(sym));
+
  switch (sym->def[def].tri) {
  case no:
  break;
@@ -527,8 +530,7 @@ kconfig_print_comment(FILE *fp, const char *value, void *arg)
  }
  }

-static struct conf_printer kconfig_printer_cb =
-{
+static struct conf_printer kconfig_printer_cb = {
  .print_symbol = kconfig_print_symbol,
  .print_comment = kconfig_print_comment,
  };
@@ -601,8 +603,7 @@ header_print_comment(FILE *fp, const char *value, void *arg)
  fprintf(fp, " */\n");
  }

-static struct conf_printer header_printer_cb =
-{
+static struct conf_printer header_printer_cb = {
  .print_symbol = header_print_symbol,
  .print_comment = header_print_comment,
  };
@@ -613,15 +614,16 @@ static struct conf_printer header_printer_cb
  * This printer is used when generating the `include/config/tristate.conf' file.
  */
 static void
-tristate_print_symbol(FILE *fp, struct symbol *sym, const char *value, void *arg)
+tristate_print_symbol(FILE *fp, struct symbol *sym,
+		      const char *value, void *arg)
 {

 	if (sym->type == S_TRISTATE && *value != 'n')
-		fprintf(fp, "%s%s=%c\n", CONFIG_, sym->name, (char)toupper(*value));
+		fprintf(fp, "%s%s=%c\n",
+			CONFIG_, sym->name, (char)toupper(*value));
 }

-static struct conf_printer tristate_printer_cb =
-{
+static struct conf_printer tristate_printer_cb = {
  .print_symbol = tristate_print_symbol,
  .print_comment = kconfig_print_comment,
  };
@@ -680,8 +682,7 @@ int conf_write_defconfig(const char *filename)
  /* Traverse all menus to find all relevant symbols */
  menu = rootmenu.list;

-while (menu != NULL)
-{
+while (menu != NULL) {
  sym = menu->sym;
  if (sym == NULL) {
  if (!menu_is_visible(menu))
@@ -695,7 +696,8 @@ int conf_write_defconfig(const char *filename)
  if (!sym_is_changable(sym))
  goto next_menu;
  /* If symbol equals to default value - skip */
-		if (strcmp(sym_get_string_value(sym), sym_get_string_default(sym)) == 0)
+		if (strcmp(sym_get_string_value(sym),
+			   sym_get_string_default(sym)) == 0)
 			goto next_menu;

  /*
@@ -722,8 +724,7 @@ int conf_write_defconfig(const char *filename)
  next_menu:
  if (menu->list != NULL) {
  menu = menu->list;
-}
-else if (menu->next != NULL) {
+} else if (menu->next != NULL) {
  menu = menu->next;
  } else {
  while ((menu = menu->parent)) {
@@ -757,19 +758,24 @@ int conf_write(const char *name)
  strcpy(dirname, name);
  strcat(dirname, "/");
  basename = conf_get_configname();
-} else if ((slash = strrchr(name, 
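
For readers unfamiliar with the file, the conf_message_callback lines near
the top of the diff belong to a small callback registration pattern. A
standalone sketch of that pattern, with simplified bodies standing in for
the real ones, also shows the (*name)(args) spacing checkpatch asks for:

#include <stdarg.h>
#include <stdio.h>

/* Simplified stand-in for the default handler in confdata.c. */
static void conf_default_message_callback(const char *fmt, va_list ap)
{
	printf("#\n# ");
	vprintf(fmt, ap);
	printf("\n#\n");
}

/* No space between (*name) and the parameter list -- the style the patch
 * applies.
 */
static void (*conf_message_callback)(const char *fmt, va_list ap) =
	conf_default_message_callback;

void conf_set_message_callback(void (*fn)(const char *fmt, va_list ap))
{
	conf_message_callback = fn;
}

/* Callers format a message once; whoever registered the callback decides
 * how it is presented.
 */
static void conf_message(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (conf_message_callback)
		conf_message_callback(fmt, ap);
	va_end(ap);
}

int main(void)
{
	conf_set_message_callback(conf_default_message_callback);
	conf_message("configuration written to %s", ".config");
	return 0;
}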

Re: [PATCH 07/33] iwlwifi: mvm: use match_string() helper

2018-05-21 Thread Yisheng Xie
Hi Andy,

On 2018/5/22 5:43, Andy Shevchenko wrote:
> On Mon, May 21, 2018 at 2:57 PM, Yisheng Xie  wrote:
>> match_string() returns the index of the array entry that matches the
>> given string, which can be used instead of the open-coded variant.
> 
>> int ret, bt_force_ant_mode;
>>
>> -   for (bt_force_ant_mode = 0;
>> -bt_force_ant_mode < ARRAY_SIZE(modes_str);
>> -bt_force_ant_mode++) {
>> -   if (!strcmp(buf, modes_str[bt_force_ant_mode]))
>> -   break;
>> -   }
>> -
>> -   if (bt_force_ant_mode >= ARRAY_SIZE(modes_str))
> 
>> +   bt_force_ant_mode = match_string(modes_str,
>> +ARRAY_SIZE(modes_str), buf);
> 
> One line?

Hmm, if we use ret instead it will not go over 80 chars.

> 
>> +   if (bt_force_ant_mode < 0)
>> return -EINVAL;
> 
> I would rather use
> 
> ret = match_string();
> if (ret < 0)
>  return ret;
> 
> bt_force_... = ret;
> 
> But it's up to Loca.

OK, I will change it if Loca agrees with your opinion.

Thanks
Yisheng
> 
>>
>> ret = 0;
> 
> 
> 
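
For reference, the helper the patch switches to behaves like the following
userspace sketch: it walks the array and returns the matching index, or
-EINVAL when nothing matches (a NULL entry ends the search early), which is
exactly what the open-coded loop computed. The mode names below are made up
for the demo, not the driver's actual modes_str:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Userspace re-implementation of the kernel's match_string() semantics. */
static int match_string(const char * const *array, size_t n,
			const char *string)
{
	size_t index;

	for (index = 0; index < n; index++) {
		if (!array[index])
			break;			/* NULL terminates the search */
		if (!strcmp(array[index], string))
			return (int)index;
	}
	return -EINVAL;
}

int main(void)
{
	/* Hypothetical mode names, standing in for the driver's table. */
	static const char * const modes_str[] = { "BT", "WIFI", "SHARED" };
	int ret;

	ret = match_string(modes_str,
			   sizeof(modes_str) / sizeof(modes_str[0]), "WIFI");
	if (ret < 0)
		return 1;	/* caller would return -EINVAL, as in the patch */
	printf("matched index %d\n", ret);
	return 0;
}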


