[PATCH 1/2] Convert target drivers to use sbitmap

2018-05-15 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The sbitmap and the percpu_ida perform essentially the same task,
allocating tags for commands.  Since the sbitmap is more widely used than
the percpu_ida, convert the percpu_ida users to the sbitmap API.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
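A sketch of the API mapping applied throughout (for review only; 'pool'
stands for a session's sess_tag_pool):

	/* before: percpu_ida */
	tag = percpu_ida_alloc(&pool, TASK_RUNNING);	/* negative on failure */
	...
	percpu_ida_free(&pool, tag);

	/* after: sbitmap */
	int cpu;
	tag = sbitmap_queue_get(&pool, &cpu);		/* negative when exhausted */
	...
	sbitmap_queue_clear(&pool, tag, cpu);

sbitmap allocates through per-CPU hints, so the CPU returned by
sbitmap_queue_get() has to be remembered (in se_cmd.map_cpu below) and
passed back when the tag is freed.
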
 drivers/scsi/qla2xxx/qla_target.c| 16 ++-
 drivers/target/iscsi/iscsi_target_util.c | 34 +---
 drivers/target/sbp/sbp_target.c  |  8 +++---
 drivers/target/target_core_transport.c   |  5 ++--
 drivers/target/tcm_fc/tfc_cmd.c  | 11 
 drivers/usb/gadget/function/f_tcm.c  |  8 +++---
 drivers/vhost/scsi.c |  9 ---
 drivers/xen/xen-scsiback.c   |  8 +++---
 include/target/iscsi/iscsi_target_core.h |  1 +
 include/target/target_core_base.h|  5 ++--
 10 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index 025dc2d3f3de..cdf671c2af61 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -3719,7 +3719,8 @@ void qlt_free_cmd(struct qla_tgt_cmd *cmd)
return;
}
cmd->jiffies_at_free = get_jiffies_64();
-	percpu_ida_free(&cmd->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
+	sbitmap_queue_clear(&cmd->se_sess->sess_tag_pool, cmd->se_cmd.map_tag,
+			    cmd->se_cmd.map_cpu);
 }
 EXPORT_SYMBOL(qlt_free_cmd);
 
@@ -4084,7 +4085,8 @@ static void __qlt_do_work(struct qla_tgt_cmd *cmd)
	qlt_send_term_exchange(qpair, NULL, &cmd->atio, 1, 0);
 
qlt_decr_num_pend_cmds(vha);
-	percpu_ida_free(&cmd->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
+	sbitmap_queue_clear(&cmd->se_sess->sess_tag_pool, cmd->se_cmd.map_tag,
+			    cmd->se_cmd.map_cpu);
spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
 
	spin_lock_irqsave(&ha->tgt.sess_lock, flags);
@@ -4215,9 +4217,9 @@ static struct qla_tgt_cmd *qlt_get_tag(scsi_qla_host_t *vha,
 {
struct se_session *se_sess = sess->se_sess;
struct qla_tgt_cmd *cmd;
-   int tag;
+   int tag, cpu;
 
-	tag = percpu_ida_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
+	tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu);
if (tag < 0)
return NULL;
 
@@ -4230,6 +4232,7 @@ static struct qla_tgt_cmd *qlt_get_tag(scsi_qla_host_t *vha,
qlt_incr_num_pend_cmds(vha);
cmd->vha = vha;
cmd->se_cmd.map_tag = tag;
+   cmd->se_cmd.map_cpu = cpu;
cmd->sess = sess;
cmd->loop_id = sess->loop_id;
cmd->conf_compl_supported = sess->conf_compl_supported;
@@ -5212,7 +5215,7 @@ qlt_alloc_qfull_cmd(struct scsi_qla_host *vha,
struct fc_port *sess;
struct se_session *se_sess;
struct qla_tgt_cmd *cmd;
-   int tag;
+   int tag, cpu;
unsigned long flags;
 
if (unlikely(tgt->tgt_stop)) {
@@ -5244,7 +5247,7 @@ qlt_alloc_qfull_cmd(struct scsi_qla_host *vha,
 
se_sess = sess->se_sess;
 
-	tag = percpu_ida_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
+	tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu);
if (tag < 0)
return;
 
@@ -5275,6 +5278,7 @@ qlt_alloc_qfull_cmd(struct scsi_qla_host *vha,
cmd->reset_count = ha->base_qpair->chip_reset;
cmd->q_full = 1;
cmd->qpair = ha->base_qpair;
+   cmd->se_cmd.map_cpu = cpu;
 
if (qfull) {
cmd->q_full = 1;
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 4435bf374d2d..28bcffae609f 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -17,7 +17,7 @@
  
 ******************************************************************************/
 
 #include <linux/list.h>
-#include <linux/percpu_ida.h>
+#include <linux/sbitmap.h>
 #include <net/ipv6.h>         /* ipv6_addr_equal() */
 #include <scsi/scsi_tcq.h>
 #include <scsi/iscsi_proto.h>
@@ -147,6 +147,28 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
 	spin_unlock_bh(&cmd->r2t_lock);
 }
 
+int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
+{
+   int tag = -1;
+   DEFINE_WAIT(wait);
+   struct sbq_wait_state *ws;
+
+   if (state == TASK_RUNNING)
+   return tag;
+
+	ws = &se_sess->sess_tag_pool.ws[0];
+	for (;;) {
+		prepare_to_wait_exclusive(&ws->wait, &wait, state);
+		if (signal_pending_state(state, current))
+			break;
+		tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
+		if (tag >= 0)
+			break;
+		schedule();
+	}
+
+	finish_wait(&ws->wait, &wait);
+	return tag;
+}
+
 /*
  * May be called from software interrupt (timer) context for allocating
  * iSCSI NopINs.
@@ -155,9 +177,11 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn *conn, int state)

[PATCH 0/2] Use sbitmap instead of percpu_ida

2018-05-15 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

This is a pretty rough-and-ready conversion of the target drivers
from using percpu_ida to sbitmap.  It compiles; I don't have a target
setup, so it's completely untested.  I haven't tried to do anything
particularly clever here, so it's possible that, for example, the wait
queue in iscsi_target_util could be smarter -- the block layer uses
multiple wait queues to avoid ping-ponging.  Or maybe we could figure
out a way to not store the CPU that the ID was allocated on, or perhaps
the options I specified to sbitmap_queue_init() are suboptimal.
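
For reference, patch 1 initialises the pool roughly like this (in
transport_alloc_session_tags(); the -1 shift lets sbitmap pick its own
word granularity, and round-robin allocation is off):

	rc = sbitmap_queue_init_node(&se_sess->sess_tag_pool, tag_num, -1,
			false, GFP_KERNEL, NUMA_NO_NODE);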

Patch 2 isn't interesting; it just deletes the implementation.  Patch 1
will be where all the action is.

Matthew Wilcox (2):
  Convert target drivers to use sbitmap
  Remove percpu_ida

 drivers/scsi/qla2xxx/qla_target.c|  16 +-
 drivers/target/iscsi/iscsi_target_util.c |  34 +-
 drivers/target/sbp/sbp_target.c  |   8 +-
 drivers/target/target_core_transport.c   |   5 +-
 drivers/target/tcm_fc/tfc_cmd.c  |  11 +-
 drivers/usb/gadget/function/f_tcm.c  |   8 +-
 drivers/vhost/scsi.c |   9 +-
 drivers/xen/xen-scsiback.c   |   8 +-
 include/linux/percpu_ida.h   |  83 -
 include/target/iscsi/iscsi_target_core.h |   1 +
 include/target/target_core_base.h|   5 +-
 lib/Makefile |   2 +-
 lib/percpu_ida.c | 391 ---
 13 files changed, 74 insertions(+), 507 deletions(-)
 delete mode 100644 include/linux/percpu_ida.h
 delete mode 100644 lib/percpu_ida.c

-- 
2.17.0



[PATCH 2/2] Remove percpu_ida

2018-05-15 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

With its one user gone, remove the library code.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/percpu_ida.h |  83 
 lib/Makefile   |   2 +-
 lib/percpu_ida.c   | 391 -
 3 files changed, 1 insertion(+), 475 deletions(-)
 delete mode 100644 include/linux/percpu_ida.h
 delete mode 100644 lib/percpu_ida.c

diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h
deleted file mode 100644
index 07d78e4653bc..000000000000
--- a/include/linux/percpu_ida.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PERCPU_IDA_H__
-#define __PERCPU_IDA_H__
-
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/spinlock_types.h>
-#include <linux/wait.h>
-#include <linux/cpumask.h>
-
-struct percpu_ida_cpu;
-
-struct percpu_ida {
-   /*
-* number of tags available to be allocated, as passed to
-* percpu_ida_init()
-*/
-	unsigned			nr_tags;
-	unsigned			percpu_max_size;
-	unsigned			percpu_batch_size;
-
-   struct percpu_ida_cpu __percpu  *tag_cpu;
-
-   /*
-* Bitmap of cpus that (may) have tags on their percpu freelists:
-* steal_tags() uses this to decide when to steal tags, and which cpus
-* to try stealing from.
-*
-* It's ok for a freelist to be empty when its bit is set - steal_tags()
-* will just keep looking - but the bitmap _must_ be set whenever a
-* percpu freelist does have tags.
-*/
-   cpumask_t   cpus_have_tags;
-
-   struct {
-   spinlock_t  lock;
-   /*
-* When we go to steal tags from another cpu (see steal_tags()),
-* we want to pick a cpu at random. Cycling through them every
-* time we steal is a bit easier and more or less equivalent:
-*/
-		unsigned		cpu_last_stolen;
-
-   /* For sleeping on allocation failure */
-   wait_queue_head_t   wait;
-
-   /*
-* Global freelist - it's a stack where nr_free points to the
-* top
-*/
-		unsigned		nr_free;
-		unsigned		*freelist;
-	} ____cacheline_aligned_in_smp;
-};
-
-/*
- * Number of tags we move between the percpu freelist and the global freelist at
- * a time
- */
-#define IDA_DEFAULT_PCPU_BATCH_MOVE	32U
-/* Max size of percpu freelist, */
-#define IDA_DEFAULT_PCPU_SIZE  ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
-
-int percpu_ida_alloc(struct percpu_ida *pool, int state);
-void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
-
-void percpu_ida_destroy(struct percpu_ida *pool);
-int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
-   unsigned long max_size, unsigned long batch_size);
-static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
-{
-   return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
-   IDA_DEFAULT_PCPU_BATCH_MOVE);
-}
-
-typedef int (*percpu_ida_cb)(unsigned, void *);
-int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
-   void *data);
-
-unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
-#endif /* __PERCPU_IDA_H__ */
diff --git a/lib/Makefile b/lib/Makefile
index ce20696d5a92..7626dece1d27 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -39,7 +39,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
+percpu-refcount.o rhashtable.o reciprocal_div.o \
 once.o refcount.o usercopy.o errseq.o bucket_locks.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
deleted file mode 100644
index 6016f1deb1f5..000000000000
--- a/lib/percpu_ida.c
+++ /dev/null
@@ -1,391 +0,0 @@
-/*
- * Percpu IDA library
- *
- * Copyright (C) 2013 Datera, Inc. Kent Overstreet
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#includ

Re: [PATCH net-next 1/2] tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive

2018-04-25 Thread Matthew Wilcox
On Wed, Apr 25, 2018 at 09:20:55AM -0700, Eric Dumazet wrote:
> On 04/25/2018 09:04 AM, Matthew Wilcox wrote:
> > If you don't zap the page range, any of the CPUs in the system where
> > any thread in this task have ever run may have a TLB entry pointing to
> > this page ... if the page is being recycled into the page allocator,
> > then that page might end up as a slab page or page table or page cache
> > while the other CPU still have access to it.
> 
> Yes, this makes sense.
> 
> > 
> > You could hang onto the page until you've built up a sufficiently large
> > batch, then bulk-invalidate all of the TLB entries, but we start to get
> > into weirdnesses on different CPU architectures.
> > 
> 
> zap_page_range() is already doing a bulk-invalidate,
> so maybe vm_replace_page() wont bring serious improvement if we end-up doing 
> same dance.

Sorry, I was unclear.  zap_page_range() bulk-invalidates all pages that
were torn down as part of this call.  What I was trying to say was that
we could have a whole new API which put page after page into the same
address, and bumped the refcount on them to prevent them from actually
being freed.  Once we get to a batch limit, we invalidate all of the
pages which were mapped at those addresses and can then free the pages
back to the allocator.
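
In pseudo-code, something like this -- every name here is hypothetical,
it's just to illustrate the batching:

	/* map 'new' at 'uaddr', deferring the TLB flush */
	get_page(old);				/* old page can't be recycled yet */
	batch->pages[batch->nr++] = old;
	set_pte_at(mm, uaddr, ptep, mk_pte(new, prot));

	if (batch->nr == BATCH_MAX) {
		flush_tlb_mm(mm);		/* one shootdown for the whole batch */
		while (batch->nr)
			put_page(batch->pages[--batch->nr]);
	}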

I don't think you can implement this scheme on s390 because it requires
the userspace address to still be mapped to that page on shootdown
(?) but I think we could implement it on x86.

Another possibility is if we had some way to insert the TLB entry into
the local CPU's page tables only, we wouldn't need to broadcast-invalidate
the TLB entry; we could just do it locally which is relatively quick.


Re: [PATCH net-next 1/2] tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive

2018-04-25 Thread Matthew Wilcox
On Wed, Apr 25, 2018 at 06:01:02AM -0700, Eric Dumazet wrote:
> On 04/24/2018 11:28 PM, Christoph Hellwig wrote:
> > On Tue, Apr 24, 2018 at 10:27:21PM -0700, Eric Dumazet wrote:
> >> When adding tcp mmap() implementation, I forgot that socket lock
> >> had to be taken before current->mm->mmap_sem. syzbot eventually caught
> >> the bug.
> >>
> >> Since we can not lock the socket in tcp mmap() handler we have to
> >> split the operation in two phases.
> >>
> >> 1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
> >>   This operation does not involve any TCP locking.
> >>
> >> 2) setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
> >>  the transfert of pages from skbs to one VMA.
> >>   This operation only uses down_read(>mm->mmap_sem) after
> >>   holding TCP lock, thus solving the lockdep issue.
> >>
> >> This new implementation was suggested by Andy Lutomirski with great 
> >> details.
> > 
> > Thanks, this looks much more sensible to me.
> > 
> 
> Thanks Christoph
> 
> Note the high cost of zap_page_range(), needed to avoid -EBUSY being returned
> from vm_insert_page() the second time TCP_ZEROCOPY_RECEIVE is used on one VMA.
> 
> Ideally a vm_replace_page() would avoid this cost ?

If you don't zap the page range, any of the CPUs in the system where
any thread in this task have ever run may have a TLB entry pointing to
this page ... if the page is being recycled into the page allocator,
then that page might end up as a slab page or page table or page cache
while the other CPU still have access to it.

You could hang onto the page until you've built up a sufficiently large
batch, then bulk-invalidate all of the TLB entries, but we start to get
into weirdnesses on different CPU architectures.


Re: [PATCH v3] kvmalloc: always use vmalloc if CONFIG_DEBUG_SG

2018-04-24 Thread Matthew Wilcox
On Tue, Apr 24, 2018 at 08:29:14AM -0400, Mikulas Patocka wrote:
> 
> 
> On Mon, 23 Apr 2018, Matthew Wilcox wrote:
> 
> > On Mon, Apr 23, 2018 at 08:06:16PM -0400, Mikulas Patocka wrote:
> > > Some bugs (such as buffer overflows) are better detected
> > > with kmalloc code, so we must test the kmalloc path too.
> > 
> > Well now, this brings up another item for the collective TODO list --
> > implement redzone checks for vmalloc.  Unless this is something already
> > taken care of by kasan or similar.
> 
> The kmalloc overflow testing is also not ideal - it rounds the size up to 
> the next slab size and detects buffer overflows only at this boundary.
> 
> Some times ago, I made a "kmalloc guard" patch that places a magic number 
> immediatelly after the requested size - so that it can detect overflows at 
> byte boundary 
> ( https://www.redhat.com/archives/dm-devel/2014-September/msg00018.html )
> 
> That patch found a bug in crypto code:
> ( http://lkml.iu.edu/hypermail/linux/kernel/1409.1/02325.html )

Is it still worth doing this, now we have kasan?


Re: [PATCH v3] kvmalloc: always use vmalloc if CONFIG_DEBUG_SG

2018-04-23 Thread Matthew Wilcox
On Mon, Apr 23, 2018 at 08:06:16PM -0400, Mikulas Patocka wrote:
> Some bugs (such as buffer overflows) are better detected
> with kmalloc code, so we must test the kmalloc path too.

Well now, this brings up another item for the collective TODO list --
implement redzone checks for vmalloc.  Unless this is something already
taken care of by kasan or similar.


Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM

2018-04-21 Thread Matthew Wilcox
On Fri, Apr 20, 2018 at 05:21:26PM -0400, Mikulas Patocka wrote:
> On Fri, 20 Apr 2018, Matthew Wilcox wrote:
> > On Fri, Apr 20, 2018 at 04:54:53PM -0400, Mikulas Patocka wrote:
> > > On Fri, 20 Apr 2018, Michal Hocko wrote:
> > > > No way. This is just wrong! First of all, you will explode most likely
> > > > on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> > > > enabled quite often.
> > > 
> > > You're an evil person who doesn't want to fix bugs.
> > 
> > Steady on.  There's no need for that.  Michal isn't evil.  Please
> > apologise.
> 
> I see this attitude from Michal again and again.

Fine; then *say that*.  I also see Michal saying "No" a lot.  Sometimes
I agree with him, sometimes I don't.  I think he genuinely wants the best
code in the kernel, and saying "No" is part of it.

> He didn't want to fix vmalloc(GFP_NOIO)

I don't remember that conversation, so I don't know whether I agree with
his reasoning or not.  But we are supposed to be moving away from GFP_NOIO
towards marking regions with memalloc_noio_save() / restore.  If you do
that, you won't need vmalloc(GFP_NOIO).
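
That is, roughly:

	unsigned int noio_flags = memalloc_noio_save();
	buf = vmalloc(size);	/* allocations in this scope implicitly behave as GFP_NOIO */
	...
	memalloc_noio_restore(noio_flags);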

> he didn't want to fix alloc_pages sleeping when __GFP_NORETRY is used.

The GFP flags are a mess, still.

> So what should I say? Fix them and 
> you won't be evil :-)

No, you should reserve calling somebody evil for truly evil things.

> (he could also fix the oom killer, so that it is triggered when 
> free_memory+cache+free_swap goes beyond a threshold and not when you loop 
> too long in the allocator)

... that also doesn't make somebody evil.

> I already said that we can change it from CONFIG_DEBUG_VM to 
> CONFIG_DEBUG_SG - or to whatever other option you may want, just to make 
> sure that it is enabled in distro debug kernels by default.

Yes, and I think that's the right idea.  So send a v2 and ignore the
replies that are clearly relating to an earlier version of the patch.
Not everybody reads every mail in the thread before responding to one they
find interesting.  Yes, ideally, one would, but sometimes one doesn't.



Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM

2018-04-20 Thread Matthew Wilcox
On Fri, Apr 20, 2018 at 04:54:53PM -0400, Mikulas Patocka wrote:
> On Fri, 20 Apr 2018, Michal Hocko wrote:
> > No way. This is just wrong! First of all, you will explode most likely
> > on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> > enabled quite often.
> 
> You're an evil person who doesn't want to fix bugs.

Steady on.  There's no need for that.  Michal isn't evil.  Please
apologise.

> You refused to fix vmalloc(GFP_NOIO) misbehavior a year ago (did you make 
> some progress with it since that time?) and you refuse to fix kvmalloc 
> misuses.

I understand you're frustrated, but this is not the way to get the problems
fixed.

> I tried this patch on text-only virtual machine and /proc/vmallocinfo 
> shows 614kB more memory. I tried it on a desktop machine with the chrome 
> browser open and /proc/vmallocinfo space is increased by 7MB. So no - this 
> won't exhaust memory and kill the machine.

This is good data, thank you for providing it.

> Arguing that this increases memory consumption is as bogus as arguing that 
> CONFIG_LOCKDEP increses memory consumption. No one is forcing you to 
> enable CONFIG_LOCKDEP and no one is forcing you to enable this kvmalloc 
> test too.

I think there's a real problem which is that CONFIG_DEBUG_VM is too broad.
It inserts code in a *lot* of places, some of which is quite expensive.
We would do better to split it into more granular pieces ... although
an explosion of configuration options isn't great either.  Maybe just
CONFIG_DEBUG_VM and CONFIG_DEBUG_VM_EXPENSIVE.

Michal may be wrong, but he's not evil.


Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM

2018-04-20 Thread Matthew Wilcox
On Fri, Apr 20, 2018 at 03:08:52PM +0200, Michal Hocko wrote:
> > In order to detect these bugs reliably I submit this patch that changes
> > kvmalloc to always use vmalloc if CONFIG_DEBUG_VM is turned on.
> 
> No way. This is just wrong! First of all, you will explode most likely
> on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> enabled quite often.

I think it'll still suit Mikulas' debugging needs if we always use
vmalloc for sizes above PAGE_SIZE?
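
Something like this, say -- a sketch, not a posted patch:

	/* in kvmalloc_node(), before the kmalloc attempt */
	if (IS_ENABLED(CONFIG_DEBUG_VM) && size > PAGE_SIZE)
		return __vmalloc(size, flags, PAGE_KERNEL);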



Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM

2018-04-20 Thread Matthew Wilcox
On Thu, Apr 19, 2018 at 12:12:38PM -0400, Mikulas Patocka wrote:
> Unfortunatelly, some kernel code has bugs - it uses kvmalloc and then
> uses DMA-API on the returned memory or frees it with kfree. Such bugs were
> found in the virtio-net driver, dm-integrity or RHEL7 powerpc-specific
> code.

Maybe it's time to have the SG code handle vmalloced pages?  This is
becoming more and more common with vmapped stacks (and some of our
workarounds are hideous -- allocate 4 bytes with kmalloc because we can't
DMA onto the stack any more?).  We already have a few places which do
handle sgs of vmalloced addresses, such as the nx crypto driver:

	if (is_vmalloc_addr(start_addr))
		sg_addr = page_to_phys(vmalloc_to_page(start_addr))
			  + offset_in_page(sg_addr);
	else
		sg_addr = __pa(sg_addr);

and videobuf:

	pg = vmalloc_to_page(virt);
	if (NULL == pg)
		goto err;
	BUG_ON(page_to_pfn(pg) >= (1 << (32 - PAGE_SHIFT)));
	sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);

Yes, there's the potential that we have to produce two SG entries for a
virtually contiguous region if it crosses a page boundary, and our APIs
aren't set up right to make it happen.  But this is something we should
consider fixing ... otherwise we'll end up with dozens of driver hacks.
The videobuf implementation was already copy-and-pasted into the saa7146
driver, for example.
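
A generic helper would look something like this (untested sketch; the
name is made up, and it assumes the caller ran sg_init_table() sized
for the worst case of one entry per page):

	static int sg_set_vmalloc_buf(struct scatterlist *sgl, void *buf,
				      size_t len)
	{
		int i = 0;

		while (len) {
			struct page *pg = vmalloc_to_page(buf);
			unsigned int off = offset_in_page(buf);
			unsigned int seg = min_t(size_t, len, PAGE_SIZE - off);

			if (!pg)
				return -EINVAL;
			sg_set_page(&sgl[i++], pg, seg, off);
			buf += seg;
			len -= seg;
		}
		sg_mark_end(&sgl[i - 1]);
		return i;
	}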



Re: [PATCH 000/109] remove in-kernel calls to syscalls

2018-03-29 Thread Matthew Wilcox
On Thu, Mar 29, 2018 at 01:22:37PM +0200, Dominik Brodowski wrote:
> At least on 64-bit x86, it will likely be a hard requirement from v4.17
> onwards to not call system call functions in the kernel: It is better to
> use use a different calling convention for system calls there, where 
> struct pt_regs is decoded on-the-fly in a syscall wrapper which then hands
> processing over to the actual syscall function. This means that only those
> parameters which are actually needed for a specific syscall are passed on
> during syscall entry, instead of filling in six CPU registers with random
> user space content all the time (which may cause serious trouble down the
> call chain).[*]

How do we stop new ones from springing up?  Some kind of linker trick
like was used to, er, "dissuade" people from using gets()?
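
(The gets() trick, for the record, plants a string in a
.gnu.warning.<symbol> ELF section, which ld prints whenever an object
references the symbol -- roughly:

	static const char __warn_sys_foo[]
		__attribute__((used, section(".gnu.warning.sys_foo")))
		= "warning: in-kernel calls to sys_foo are deprecated";

where sys_foo is whichever syscall function we want to fence off.)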


Re: [PATCH v2 6/8] page_frag_cache: Use a mask instead of offset

2018-03-22 Thread Matthew Wilcox
On Thu, Mar 22, 2018 at 09:41:57AM -0700, Matthew Wilcox wrote:
> On Thu, Mar 22, 2018 at 09:22:31AM -0700, Alexander Duyck wrote:
> > You could just use the pfc->mask here instead of size - 1 just to
> > avoid having to do the subtraction more than once assuming the
> > compiler doesn't optimize it.
> 
> Either way I'm assuming a compiler optimisation -- that it won't reload
> from memory, or that it'll remember the subtraction.  I don't much care
> which, and I'll happily use the page_frag_cache_mask() if that reads better
> for you.

Looks like it does reload from memory if I make that change.  Before:

37e7:	c7 43 08 ff 7f 00 00	movl   $0x7fff,0x8(%rbx)
37ee:	b9 00 80 00 00		mov    $0x8000,%ecx
37f3:	be ff 7f 00 00		mov    $0x7fff,%esi
37f8:	ba 00 80 00 00		mov    $0x8000,%edx
...
380b:	01 70 1c		add    %esi,0x1c(%rax)

After:

37e7:	c7 43 08 ff 7f 00 00	movl   $0x7fff,0x8(%rbx)
37ee:	b9 00 80 00 00		mov    $0x8000,%ecx
37f3:	ba 00 80 00 00		mov    $0x8000,%edx
...
3806:	8b 73 08		mov    0x8(%rbx),%esi
3809:	01 70 1c		add    %esi,0x1c(%rax)

Of course, it's shorter because it's fewer bytes to reload from memory
than it is to put a 32-bit immediate in the instruction stream, but
it's one additional memory reference (cache-hot, of course).  I don't
really care because it's the cold path.


Re: [PATCH v2 1/8] page_frag_cache: Remove pfmemalloc bool

2018-03-22 Thread Matthew Wilcox
On Thu, Mar 22, 2018 at 09:39:40AM -0700, Alexander Duyck wrote:
> So I was just thinking about this and it would probably make more
> sense to look at addressing this after you take care of your
> conversion from size/offset to a mask. One thing with the mask is that
> it should never reach 64K since that is the largest page size if I
> recall. With that being the case we could look at dropping mask to a
> u16 value and then add a u16 flags field where you could store things
> like this. Then you could avoid having to do the masking and math you
> are having to do below.

With the flag in the top bit, there's actually no maths at all in the
caller; it only looks like it in C.  Here's what GCC ends up doing:

 e66:	e8 00 00 00 00		callq  e6b <__netdev_alloc_skb+0x7b>
			e67: R_X86_64_PC32	page_frag_alloc-0x4
 e6b:	44 8b 3d 00 00 00 00	mov    0x0(%rip),%r15d
...
 e8c:	45 85 ff		test   %r15d,%r15d
 e8f:	79 04			jns    e95 <__netdev_alloc_skb+0xa5>
 e91:	80 48 78 08		orb    $0x8,0x78(%rax)
 e95:	80 48 76 20		orb    $0x20,0x76(%rax)

ie it's testing the top bit by looking at the sign bit.  If I move it to
the second-top bit (1 << 30), it does this instead:

 e66:	e8 00 00 00 00		callq  e6b <__netdev_alloc_skb+0x7b>
			e67: R_X86_64_PC32	page_frag_alloc-0x4
 e6b:	44 8b 2d 00 00 00 00	mov    0x0(%rip),%r13d
...
 e75:	41 81 e5 00 00 00 40	and    $0x40000000,%r13d
...
 e93:	45 85 ed		test   %r13d,%r13d
 e96:	74 04			je     e9c <__netdev_alloc_skb+0xac>
 e98:	80 48 78 08		orb    $0x8,0x78(%rax)
 e9c:	80 48 76 20		orb    $0x20,0x76(%rax)

Changing mask to an unsigned short and adding a bool pfmemalloc to the
struct, I get:

 e66:	e8 00 00 00 00		callq  e6b <__netdev_alloc_skb+0x7b>
			e67: R_X86_64_PC32	page_frag_alloc-0x4
 e6b:	44 0f b6 3d 00 00 00	movzbl 0x0(%rip),%r15d
 e72:	00
...
 e8d:	45 84 ff		test   %r15b,%r15b
 e90:	74 04			je     e96 <__netdev_alloc_skb+0xa6>
 e92:	80 48 78 08		orb    $0x8,0x78(%rax)
 e96:	80 48 76 20		orb    $0x20,0x76(%rax)

actually one byte less efficient code due to movzbl being one byte longer.


Re: [PATCH v2 6/8] page_frag_cache: Use a mask instead of offset

2018-03-22 Thread Matthew Wilcox
On Thu, Mar 22, 2018 at 09:22:31AM -0700, Alexander Duyck wrote:
> On Thu, Mar 22, 2018 at 8:31 AM, Matthew Wilcox <wi...@infradead.org> wrote:
> > By combining 'va' and 'offset' into 'addr' and using a mask instead,
> > we can save a compare-and-branch in the fast-path of the allocator.
> > This removes 4 instructions on x86 (both 32 and 64 bit).
> 
> What is the point of renaming "va"? I'm seeing a lot of unneeded
> renaming in these patches that doesn't really seem needed and is just
> making things harder to review.

By renaming 'va', I made sure that I saw everywhere that 'va' was touched,
and reviewed it to be sure it was still correct with the new meaning.
The advantage of keeping that in the patch submission, rather than
renaming it back again, is that you can see everywhere that it's been
touched and verify that for yourself.

> > We can avoid storing the mask at all if we know that we're only allocating
> > a single page.  This shrinks page_frag_cache from 12 to 8 bytes on 32-bit
> > CONFIG_BASE_SMALL build.
> >
> > Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
> 
> So I am not really a fan of CONFIG_BASE_SMALL in general, so
> advertising gains in size is just going back down the reducing size at
> the expense of performance train of thought.

There's no tradeoff for performance *in this patch* with
CONFIG_BASE_SMALL.  Indeed, being able to assume that the cache contains a
single PAGE_SIZE page reduces the number of instructions by two on x86-64
(and is neutral on x86-32).  IIRC it saves a register, so there's one fewer
'push' at the beginning of the function and one fewer 'pop' at the end.

I think the more compelling argument for conditioning the number of pages
allocated on CONFIG_BASE_SMALL is that a machine which needs to shrink
its data structures so badly isn't going to have 32k of memory available,
nor want to spend it on a networking allocation.  Eric's commit which
introduced NETDEV_FRAG_PAGE_MAX_ORDER back in 2012 (69b08f62e174) didn't
mention small machines as a consideration.

> Do we know for certain that a higher order page is always aligned to
> the size of the higher order page itself? That is one thing I have
> never been certain about. I know for example there are head and tail
> pages so I was never certain if it was possible to create a higher
> order page that is not aligned to to the size of the page itself.

It's intrinsic to the buddy allocator that pages are naturally aligned
to their order.  There's a lot of code in the kernel which relies on
it, including much of the mm (particularly THP).  I suppose one could
construct a non-aligned compound page, but it'd be really weird, and you'd
have to split it up manually before handing it back to the page allocator.
I don't see this ever changing.
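
(Concretely, any page returned by alloc_pages(gfp, order) satisfies

	VM_BUG_ON(page_to_pfn(page) & ((1UL << order) - 1));

which is the natural-alignment invariant everything relies on.)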

> >  struct page_frag_cache {
> > -   void * va;
> > +   void *addr;
> >  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > -   __u16 offset;
> > -   __u16 size;
> > -#else
> > -   __u32 offset;
> > +   unsigned int mask;
> 
> So this is just an akward layout. You now have essentially:
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> #else
> unsigned int mask;
> #endif

Huh?  There's a '-' in front of the '#else'.  It looks like this:

struct page_frag_cache {
	void *addr;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	unsigned int mask;
#endif
	/* we maintain a pagecount bias, so that we dont dirty cache line
	 * containing page->_refcount every time we allocate a fragment.
	 */
	unsigned int		pagecnt_bias;
};

> > @@ -4364,27 +4361,24 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *pfc,
> > 				PAGE_FRAG_CACHE_MAX_ORDER);
> > 	if (page)
> > 		size = PAGE_FRAG_CACHE_MAX_SIZE;
> > -	pfc->size = size;
> > +	pfc->mask = size - 1;
> >  #endif
> > if (unlikely(!page))
> > page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> > if (!page) {
> > -   pfc->va = NULL;
> > +   pfc->addr = NULL;
> > return NULL;
> > }
> >
> > -   pfc->va = page_address(page);
> > -
> > /* Using atomic_set() would break get_page_unless_zero() users. */
> > page_ref_add(page, size - 1);
> 
> You could just use the pfc->mask here instead of size - 1 just to
> avoid having to do the subtraction more than once assuming the
> compiler doesn't optimize it.

Either way I'm assuming a compiler optimisation -- that it won't reload
from memory, or that it'll remember the subtraction.  I don't much care
which, and I'll happily use the page_frag_cache_mask() if that reads better
for you.



[PATCH v2 6/8] page_frag_cache: Use a mask instead of offset

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

By combining 'va' and 'offset' into 'addr' and using a mask instead,
we can save a compare-and-branch in the fast-path of the allocator.
This removes 4 instructions on x86 (both 32 and 64 bit).

We can avoid storing the mask at all if we know that we're only allocating
a single page.  This shrinks page_frag_cache from 12 to 8 bytes on 32-bit
CONFIG_BASE_SMALL build.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
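The trick, spelled out for review: allocation proceeds downwards from
the top of the page, so the low bits of pfc->addr are exactly the
number of bytes still free below it.  A NULL addr masks to 0, so a
single unsigned compare covers both the empty cache and the exhausted
page:

	offset = (unsigned long)addr & mask;	/* bytes free below addr */
	if (unlikely(offset < size))		/* also catches addr == NULL */
		addr = __page_frag_cache_refill(pfc, gfp_mask);
	addr -= size;
	pfc->addr = addr;
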
 include/linux/mm_types.h | 12 +++-
 mm/page_alloc.c  | 40 +++-
 2 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0defff9e3c0e..ebe93edec752 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,12 +225,9 @@ struct page {
 #define PFC_MEMALLOC   (1U << 31)
 
 struct page_frag_cache {
-   void * va;
+   void *addr;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   __u16 offset;
-   __u16 size;
-#else
-   __u32 offset;
+   unsigned int mask;
 #endif
/* we maintain a pagecount bias, so that we dont dirty cache line
 * containing page->_refcount every time we allocate a fragment.
@@ -239,6 +236,11 @@ struct page_frag_cache {
 };
 
 #define page_frag_cache_pfmemalloc(pfc)	((pfc)->pagecnt_bias & PFC_MEMALLOC)
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+#define page_frag_cache_mask(pfc)  (pfc)->mask
+#else
+#define page_frag_cache_mask(pfc)  (~PAGE_MASK)
+#endif
 
 typedef unsigned long vm_flags_t;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a2e3e293079..d15a5348a8e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4336,22 +4336,19 @@ EXPORT_SYMBOL(free_pages);
  * drivers to provide a backing region of memory for use as either an
  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
  */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *pfc,
+static void *__page_frag_cache_refill(struct page_frag_cache *pfc,
 gfp_t gfp_mask)
 {
unsigned int size = PAGE_SIZE;
struct page *page = NULL;
-   struct page *old = pfc->va ? virt_to_page(pfc->va) : NULL;
+   struct page *old = pfc->addr ? virt_to_head_page(pfc->addr) : NULL;
gfp_t gfp = gfp_mask;
unsigned int pagecnt_bias = pfc->pagecnt_bias & ~PFC_MEMALLOC;
 
/* If all allocations have been freed, we can reuse this page */
if (old && page_ref_sub_and_test(old, pagecnt_bias)) {
page = old;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   /* if size can vary use size else just use PAGE_SIZE */
-   size = pfc->size;
-#endif
+   size = page_frag_cache_mask(pfc) + 1;
/* Page count is 0, we can safely set it */
set_page_count(page, size);
goto reset;
@@ -4364,27 +4361,24 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *pfc,
PAGE_FRAG_CACHE_MAX_ORDER);
if (page)
size = PAGE_FRAG_CACHE_MAX_SIZE;
-   pfc->size = size;
+   pfc->mask = size - 1;
 #endif
if (unlikely(!page))
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
if (!page) {
-   pfc->va = NULL;
+   pfc->addr = NULL;
return NULL;
}
 
-   pfc->va = page_address(page);
-
/* Using atomic_set() would break get_page_unless_zero() users. */
page_ref_add(page, size - 1);
 reset:
-   /* reset page count bias and offset to start of new frag */
pfc->pagecnt_bias = size;
if (page_is_pfmemalloc(page))
pfc->pagecnt_bias |= PFC_MEMALLOC;
-   pfc->offset = size;
+   pfc->addr = page_address(page) + size;
 
-   return page;
+   return pfc->addr;
 }
 
 void __page_frag_cache_drain(struct page *page, unsigned int count)
@@ -4405,24 +4399,20 @@ EXPORT_SYMBOL(__page_frag_cache_drain);
 void *page_frag_alloc(struct page_frag_cache *pfc,
  unsigned int size, gfp_t gfp_mask)
 {
-   struct page *page;
-   int offset;
+   void *addr = pfc->addr;
+   unsigned int offset = (unsigned long)addr & page_frag_cache_mask(pfc);
 
-   if (unlikely(!pfc->va)) {
-refill:
-   page = __page_frag_cache_refill(pfc, gfp_mask);
-   if (!page)
+   if (unlikely(offset < size)) {
+   addr = __page_frag_cache_refill(pfc, gfp_mask);
+   if (!addr)
return NULL;
}
 
-   offset = pfc->offset - size;
-   if (unlikely(offset < 0))
-   goto refill;
-
+   addr -= size;
+   pfc->addr = addr;
pfc->pagecnt_bias--;
-   pfc->offset = offset;
 
-   return pfc->va + offset;
+   return addr;
 }
 EXPORT_SYMBOL(page_frag_alloc);
 
-- 
2.16.2



[PATCH v2 2/8] page_frag_cache: Move slowpath code from page_frag_alloc

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Put all the unlikely code in __page_frag_cache_refill to make the
fastpath code more obvious.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 mm/page_alloc.c | 70 -
 1 file changed, 34 insertions(+), 36 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 61366f23e8c8..6d2c106f4e5d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4339,20 +4339,50 @@ EXPORT_SYMBOL(free_pages);
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 gfp_t gfp_mask)
 {
+   unsigned int size = PAGE_SIZE;
struct page *page = NULL;
+   struct page *old = nc->va ? virt_to_page(nc->va) : NULL;
gfp_t gfp = gfp_mask;
+   unsigned int pagecnt_bias = nc->pagecnt_bias & ~PFC_MEMALLOC;
+
+   /* If all allocations have been freed, we can reuse this page */
+   if (old && page_ref_sub_and_test(old, pagecnt_bias)) {
+   page = old;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+   /* if size can vary use size else just use PAGE_SIZE */
+   size = nc->size;
+#endif
+   /* Page count is 0, we can safely set it */
+   set_page_count(page, size);
+   goto reset;
+   }
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
__GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
-   nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+   if (page)
+   size = PAGE_FRAG_CACHE_MAX_SIZE;
+   nc->size = size;
 #endif
if (unlikely(!page))
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+   if (!page) {
+   nc->va = NULL;
+   return NULL;
+   }
+
+   nc->va = page_address(page);
 
-   nc->va = page ? page_address(page) : NULL;
+   /* Using atomic_set() would break get_page_unless_zero() users. */
+   page_ref_add(page, size - 1);
+reset:
+   /* reset page count bias and offset to start of new frag */
+   nc->pagecnt_bias = size;
+   if (page_is_pfmemalloc(page))
+   nc->pagecnt_bias |= PFC_MEMALLOC;
+   nc->offset = size;
 
return page;
 }
@@ -4375,7 +4405,6 @@ EXPORT_SYMBOL(__page_frag_cache_drain);
 void *page_frag_alloc(struct page_frag_cache *nc,
  unsigned int fragsz, gfp_t gfp_mask)
 {
-   unsigned int size = PAGE_SIZE;
struct page *page;
int offset;
 
@@ -4384,42 +4413,11 @@ void *page_frag_alloc(struct page_frag_cache *nc,
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   /* if size can vary use size else just use PAGE_SIZE */
-   size = nc->size;
-#endif
-   /* Even if we own the page, we do not use atomic_set().
-* This would break get_page_unless_zero() users.
-*/
-   page_ref_add(page, size - 1);
-
-   /* reset page count bias and offset to start of new frag */
-   nc->pagecnt_bias = size;
-   if (page_is_pfmemalloc(page))
-   nc->pagecnt_bias |= PFC_MEMALLOC;
-   nc->offset = size;
}
 
offset = nc->offset - fragsz;
-   if (unlikely(offset < 0)) {
-   unsigned int pagecnt_bias = nc->pagecnt_bias & ~PFC_MEMALLOC;
-   page = virt_to_page(nc->va);
-
-   if (!page_ref_sub_and_test(page, pagecnt_bias))
-   goto refill;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   /* if size can vary use size else just use PAGE_SIZE */
-   size = nc->size;
-#endif
-   /* OK, page count is 0, we can safely set it */
-   set_page_count(page, size);
-
-   /* reset page count bias and offset to start of new frag */
-   nc->pagecnt_bias = size | (nc->pagecnt_bias - pagecnt_bias);
-   offset = size - fragsz;
-   }
+   if (unlikely(offset < 0))
+   goto refill;
 
nc->pagecnt_bias--;
nc->offset = offset;
-- 
2.16.2



[PATCH v2 1/8] page_frag_cache: Remove pfmemalloc bool

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Save 4/8 bytes by moving the pfmemalloc indicator from its own bool
to the top bit of pagecnt_bias.  This has no effect on the fastpath
of the allocator since the pagecnt_bias cannot go negative.  It's
a couple of extra instructions in the slowpath.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
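Why the fastpath is unaffected, in short: the bias starts at the page
size (at most 32768, bit 15) and only counts down towards zero before
being reset, so the decrement can never borrow into bit 31 where the
flag lives:

	nc->pagecnt_bias--;	/* touches bits 0..15 at most */
	...
	/* the slowpath strips the flag before using the bias as a count */
	unsigned int pagecnt_bias = nc->pagecnt_bias & ~PFC_MEMALLOC;
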
 include/linux/mm_types.h | 4 +++-
 mm/page_alloc.c  | 8 +---
 net/core/skbuff.c| 5 ++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fd1af6b9591d..a63b138ad1a4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -218,6 +218,7 @@ struct page {
 
 #define PAGE_FRAG_CACHE_MAX_SIZE   __ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER  get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+#define PFC_MEMALLOC   (1U << 31)
 
 struct page_frag_cache {
void * va;
@@ -231,9 +232,10 @@ struct page_frag_cache {
 * containing page->_refcount every time we allocate a fragment.
 */
unsigned intpagecnt_bias;
-   bool pfmemalloc;
 };
 
+#define page_frag_cache_pfmemalloc(pfc)	((pfc)->pagecnt_bias & PFC_MEMALLOC)
+
 typedef unsigned long vm_flags_t;
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 635d7dd29d7f..61366f23e8c8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4395,16 +4395,18 @@ void *page_frag_alloc(struct page_frag_cache *nc,
page_ref_add(page, size - 1);
 
/* reset page count bias and offset to start of new frag */
-   nc->pfmemalloc = page_is_pfmemalloc(page);
nc->pagecnt_bias = size;
+   if (page_is_pfmemalloc(page))
+   nc->pagecnt_bias |= PFC_MEMALLOC;
nc->offset = size;
}
 
offset = nc->offset - fragsz;
if (unlikely(offset < 0)) {
+   unsigned int pagecnt_bias = nc->pagecnt_bias & ~PFC_MEMALLOC;
page = virt_to_page(nc->va);
 
-   if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+   if (!page_ref_sub_and_test(page, pagecnt_bias))
goto refill;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -4415,7 +4417,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
set_page_count(page, size);
 
/* reset page count bias and offset to start of new frag */
-   nc->pagecnt_bias = size;
+   nc->pagecnt_bias = size | (nc->pagecnt_bias - pagecnt_bias);
offset = size - fragsz;
}
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0bb0d8877954..54bbde8f7541 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -412,7 +412,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
 	nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_pfmemalloc(nc);
 
local_irq_restore(flags);
 
@@ -485,8 +485,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
return NULL;
}
 
-   /* use OR instead of assignment to avoid clearing of bits in mask */
-   if (nc->page.pfmemalloc)
+	if (page_frag_cache_pfmemalloc(&nc->page))
skb->pfmemalloc = 1;
skb->head_frag = 1;
 
-- 
2.16.2



[PATCH v2 4/8] page_frag_cache: Rename fragsz to size

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The 'size' variable name used to be used for the page size.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c9fc76135dd8..5a2e3e293079 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4403,7 +4403,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
 void *page_frag_alloc(struct page_frag_cache *pfc,
- unsigned int fragsz, gfp_t gfp_mask)
+ unsigned int size, gfp_t gfp_mask)
 {
struct page *page;
int offset;
@@ -4415,7 +4415,7 @@ void *page_frag_alloc(struct page_frag_cache *pfc,
return NULL;
}
 
-   offset = pfc->offset - fragsz;
+   offset = pfc->offset - size;
if (unlikely(offset < 0))
goto refill;
 
-- 
2.16.2



[PATCH v2 8/8] page_frag: Account allocations

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Note the number of pages currently used in page_frag allocations.
This may help diagnose leaks in page_frag users.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/mmzone.h |  3 ++-
 mm/page_alloc.c| 10 +++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7522a6987595..ed6be33dcc7a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -139,10 +139,10 @@ enum zone_stat_item {
NR_ZONE_ACTIVE_FILE,
NR_ZONE_UNEVICTABLE,
 	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
+   /* Second 128 byte cacheline */
NR_MLOCK,   /* mlock()ed pages found and moved off LRU */
NR_PAGETABLE,   /* used for pagetables */
NR_KERNEL_STACK_KB, /* measured in KiB */
-   /* Second 128 byte cacheline */
NR_BOUNCE,
 #if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
@@ -175,6 +175,7 @@ enum node_stat_item {
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
NR_ANON_THPS,
+   NR_PAGE_FRAG,
NR_UNSTABLE_NFS,/* NFS unstable pages */
NR_VMSCAN_WRITE,
NR_VMSCAN_IMMEDIATE,/* Prioritise for reclaim when writeback ends */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9beafa5d2a5..5a9441b46604 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4382,6 +4382,7 @@ static void *__page_frag_cache_refill(struct page_frag_cache *pfc,
return NULL;
}
 
+   inc_node_page_state(page, NR_PAGE_FRAG);
/* Using atomic_set() would break get_page_unless_zero() users. */
page_ref_add(page, size - 1);
 reset:
@@ -4460,8 +4461,10 @@ void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
-   if (unlikely(put_page_testzero(page)))
+   if (unlikely(put_page_testzero(page))) {
+   dec_node_page_state(page, NR_PAGE_FRAG);
__free_pages_ok(page, compound_order(page));
+   }
 }
 EXPORT_SYMBOL(page_frag_free);
 
@@ -4769,7 +4772,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-   " free:%lu free_pcp:%lu free_cma:%lu\n",
+   " free:%lu free_pcp:%lu free_cma:%lu page_frag:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
global_node_page_state(NR_INACTIVE_ANON),
global_node_page_state(NR_ISOLATED_ANON),
@@ -4788,7 +4791,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_zone_page_state(NR_BOUNCE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
-   global_zone_page_state(NR_FREE_CMA_PAGES));
+   global_zone_page_state(NR_FREE_CMA_PAGES),
+   global_node_page_state(NR_PAGE_FRAG));
 
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
-- 
2.16.2



[PATCH v2 3/8] page_frag_cache: Rename 'nc' to 'pfc'

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

This name was a legacy from the 'netdev_alloc_cache' days.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 mm/page_alloc.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d2c106f4e5d..c9fc76135dd8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4336,21 +4336,21 @@ EXPORT_SYMBOL(free_pages);
  * drivers to provide a backing region of memory for use as either an
  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
  */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+static struct page *__page_frag_cache_refill(struct page_frag_cache *pfc,
 gfp_t gfp_mask)
 {
unsigned int size = PAGE_SIZE;
struct page *page = NULL;
-   struct page *old = nc->va ? virt_to_page(nc->va) : NULL;
+   struct page *old = pfc->va ? virt_to_page(pfc->va) : NULL;
gfp_t gfp = gfp_mask;
-   unsigned int pagecnt_bias = nc->pagecnt_bias & ~PFC_MEMALLOC;
+   unsigned int pagecnt_bias = pfc->pagecnt_bias & ~PFC_MEMALLOC;
 
/* If all allocations have been freed, we can reuse this page */
if (old && page_ref_sub_and_test(old, pagecnt_bias)) {
page = old;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
-   size = nc->size;
+   size = pfc->size;
 #endif
/* Page count is 0, we can safely set it */
set_page_count(page, size);
@@ -4364,25 +4364,25 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
PAGE_FRAG_CACHE_MAX_ORDER);
if (page)
size = PAGE_FRAG_CACHE_MAX_SIZE;
-   nc->size = size;
+   pfc->size = size;
 #endif
if (unlikely(!page))
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
if (!page) {
-   nc->va = NULL;
+   pfc->va = NULL;
return NULL;
}
 
-   nc->va = page_address(page);
+   pfc->va = page_address(page);
 
/* Using atomic_set() would break get_page_unless_zero() users. */
page_ref_add(page, size - 1);
 reset:
/* reset page count bias and offset to start of new frag */
-   nc->pagecnt_bias = size;
+   pfc->pagecnt_bias = size;
if (page_is_pfmemalloc(page))
-   nc->pagecnt_bias |= PFC_MEMALLOC;
-   nc->offset = size;
+   pfc->pagecnt_bias |= PFC_MEMALLOC;
+   pfc->offset = size;
 
return page;
 }
@@ -4402,27 +4402,27 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *page_frag_alloc(struct page_frag_cache *nc,
+void *page_frag_alloc(struct page_frag_cache *pfc,
  unsigned int fragsz, gfp_t gfp_mask)
 {
struct page *page;
int offset;
 
-   if (unlikely(!nc->va)) {
+   if (unlikely(!pfc->va)) {
 refill:
-   page = __page_frag_cache_refill(nc, gfp_mask);
+   page = __page_frag_cache_refill(pfc, gfp_mask);
if (!page)
return NULL;
}
 
-   offset = nc->offset - fragsz;
+   offset = pfc->offset - fragsz;
if (unlikely(offset < 0))
goto refill;
 
-   nc->pagecnt_bias--;
-   nc->offset = offset;
+   pfc->pagecnt_bias--;
+   pfc->offset = offset;
 
-   return nc->va + offset;
+   return pfc->va + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc);
 
-- 
2.16.2



[PATCH v2 5/8] page_frag_cache: Save memory on small machines

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Only allocate a single page if CONFIG_BASE_SMALL is set.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/mm_types.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a63b138ad1a4..0defff9e3c0e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -216,7 +216,11 @@ struct page {
 #endif
 } _struct_page_alignment;
 
+#if CONFIG_BASE_SMALL
+#define PAGE_FRAG_CACHE_MAX_SIZE   PAGE_SIZE
+#else
 #define PAGE_FRAG_CACHE_MAX_SIZE   __ALIGN_MASK(32768, ~PAGE_MASK)
+#endif
 #define PAGE_FRAG_CACHE_MAX_ORDER  get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 #define PFC_MEMALLOC   (1U << 31)
 
-- 
2.16.2



[PATCH v2 0/8] page_frag_cache improvements

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Version 1 was completely wrong-headed and I have repented of the error
of my ways.  Thanks for educating me.

I still think it's possible to improve on the current state of the
page_frag allocator, and here are eight patches, each of which I think
represents an improvement.  They're not all that interlinked, although
there will be textual conflicts, so I'll be happy to revise and drop
any that are not actual improvements.

I have discovered (today), much to my chagrin, that testing using trinity
in KVM doesn't actually test the page_frag allocator.  I don't understand
why not.  So, this turns out to only be compile tested.  Sorry.

The net effect of all these patches is a reduction of four instructions
in the fastpath of the allocator on x86.  The page_frag_cache structure
also shrinks, to as small as 8 bytes on 32-bit with CONFIG_BASE_SMALL.

The last patch is probably wrong.  It'll definitely be inaccurate
because the call to page_frag_free() may not be the call which frees
a page; there's a really unlikely race where the page cache finds a
stale RCU pointer, bumps its refcount, discovers it's not the page it
was looking for and calls put_page(), which might end up dropping the
last reference.  We can do something about that inaccuracy, but I don't
even know if this is the best approach to accounting these pages.

Matthew Wilcox (8):
  page_frag_cache: Remove pfmemalloc bool
  page_frag_cache: Move slowpath code from page_frag_alloc
  page_frag_cache: Rename 'nc' to 'pfc'
  page_frag_cache: Rename fragsz to size
  page_frag_cache: Save memory on small machines
  page_frag_cache: Use a mask instead of offset
  page_frag: Update documentation
  page_frag: Account allocations

 Documentation/vm/page_frags |  42 ---
 Documentation/vm/page_frags.rst |  24 +++
 include/linux/mm_types.h|  20 --
 include/linux/mmzone.h  |   3 +-
 mm/page_alloc.c | 155 
 net/core/skbuff.c   |   5 +-
 6 files changed, 135 insertions(+), 114 deletions(-)
 delete mode 100644 Documentation/vm/page_frags
 create mode 100644 Documentation/vm/page_frags.rst

-- 
2.16.2



[PATCH v2 7/8] page_frag: Update documentation

2018-03-22 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

 - Rename Documentation/vm/page_frags to page_frags.rst
 - Change page_frags.rst to be a user's guide rather than implementation
   detail.
 - Add kernel-doc for the page_frag allocator
 - Move implementation details to the comments in page_alloc.c

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 Documentation/vm/page_frags | 42 ---
 Documentation/vm/page_frags.rst | 24 
 mm/page_alloc.c | 63 -
 3 files changed, 74 insertions(+), 55 deletions(-)
 delete mode 100644 Documentation/vm/page_frags
 create mode 100644 Documentation/vm/page_frags.rst

diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags
deleted file mode 100644
index a6714565dbf9..000000000000
--- a/Documentation/vm/page_frags
+++ /dev/null
@@ -1,42 +0,0 @@
-Page fragments
---------------
-
-A page fragment is an arbitrary-length arbitrary-offset area of memory
-which resides within a 0 or higher order compound page.  Multiple
-fragments within that page are individually refcounted, in the page's
-reference counter.
-
-The page_frag functions, page_frag_alloc and page_frag_free, provide a
-simple allocation framework for page fragments.  This is used by the
-network stack and network device drivers to provide a backing region of
-memory for use as either an sk_buff->head, or to be used in the "frags"
-portion of skb_shared_info.
-
-In order to make use of the page fragment APIs a backing page fragment
-cache is needed.  This provides a central point for the fragment allocation
-and tracks allows multiple calls to make use of a cached page.  The
-advantage to doing this is that multiple calls to get_page can be avoided
-which can be expensive at allocation time.  However due to the nature of
-this caching it is required that any calls to the cache be protected by
-either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
-to be disabled when executing the fragment allocation.
-
-The network stack uses two separate caches per CPU to handle fragment
-allocation.  The netdev_alloc_cache is used by callers making use of the
-__netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
-used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
-main difference between these two calls is the context in which they may be
-called.  The "netdev" prefixed functions are usable in any context as these
-functions will disable interrupts, while the "napi" prefixed functions are
-only usable within the softirq context.
-
-Many network device drivers use a similar methodology for allocating page
-fragments, but the page fragments are cached at the ring or descriptor
-level.  In order to enable these cases it is necessary to provide a generic
-way of tearing down a page cache.  For this reason __page_frag_cache_drain
-was implemented.  It allows for freeing multiple references from a single
-page via a single call.  The advantage to doing this is that it allows for
-cleaning up the multiple references that were added to a page in order to
-avoid calling get_page per allocation.
-
-Alexander Duyck, Nov 29, 2016.
diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst
new file mode 100644
index 000000000000..e675bfad6710
--- /dev/null
+++ b/Documentation/vm/page_frags.rst
@@ -0,0 +1,24 @@
+==============
+Page fragments
+==============
+
+:Author: Alexander Duyck
+
+A page fragment is a physically contiguous area of memory that is smaller
+than a page.  It may cross a page boundary, and may be allocated at
+an arbitrary alignment.
+
+The page fragment allocator is optimised for very fast allocation
+of arbitrary-sized objects which will likely be freed soon.  It does
+not take any locks, relying on the caller to ensure that simultaneous
+allocations from the same page_frag_cache cannot occur.  The allocator
+does not support red zones or poisoning.  If the user has alignment
+requirements, rounding the size of each object allocated from the cache
+will ensure that all objects are aligned.  Do not attempt to allocate
+0 bytes; it is not checked for and will end badly.
+
+Functions
+=========
+
+.. kernel-doc:: mm/page_alloc.c
+   :functions: page_frag_alloc page_frag_free __page_frag_cache_drain
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d15a5348a8e4..b9beafa5d2a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4326,15 +4326,27 @@ void free_pages(unsigned long addr, unsigned int order)
 EXPORT_SYMBOL(free_pages);
 
 /*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is

Re: [PATCH] mlx5: Remove call to ida_pre_get

2018-03-20 Thread Matthew Wilcox
On Tue, Mar 20, 2018 at 10:46:20AM -0400, David Miller wrote:
> From: Maor Gottlieb 
> Date: Tue, 20 Mar 2018 14:41:49 +0200
> 
> > Saeed, Matan and I okay with this fix as well, it looks like it
> > shouldn't impact on the insertion rate.
> 
> I've applied this to net-next, thanks everyone.

Thanks, Dave.

I realised why this made sense when it was originally written.  Before
December 2016 (commit 7ad3d4d85c7a), ida_pre_get used to allocate one
bitmap per ida.  I moved it to a percpu variable, and at that point this
stopped making sense.


Re: [RFC 2/2] page_frag_cache: Store metadata in struct page

2018-03-16 Thread Matthew Wilcox
On Thu, Mar 15, 2018 at 02:26:28PM -0700, Alexander Duyck wrote:
> On Thu, Mar 15, 2018 at 12:53 PM, Matthew Wilcox <wi...@infradead.org> wrote:
> > From: Matthew Wilcox <mawil...@microsoft.com>
> >
> > Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
> > currently-in-use struct page) by using the page's refcount directly
> > (instead of maintaining a bias) and storing our current progress through
> > the page in the same bits currently used for page->index.  We no longer
> > need to reflect the page pfmemalloc state if we're storing the page
> > directly.
> >
> > On the downside, we now call page_address() on every allocation, and we
> > do an atomic_inc() rather than a non-atomic decrement, but we should
> > touch the same number of cachelines and there is far less code (and
> > the code is less complex).
> >
> > Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
> 
> So my concern with this patch is that it is essentially trading off
> CPU performance for reduced size. One of the reasons for getting away
> from using the page pointer is because it is expensive to access the
> page when the ref_count is bouncing on multiple cache lines.

I'm not really interested in trading off SMP scalability for reduced
memory usage; if we see more than a trivial penalty then I'll withdraw
this RFC.  I read & understand the commits that you made to get us to
the current codebase, but there's no history of this particular variation
in the tree.

0e39250845c0f91acc64264709b25f7f9b85c2c3
9451980a6646ed487efce04a9df28f450935683e
540eb7bf0bbedb65277d68ab89ae43cdec3fd6ba

I think that by moving the 'offset' into the struct page, we end up updating
only one cacheline instead of two, which should be a performance win.
I understand your concern about the cacheline bouncing between the freeing
and allocating CPUs.  Is cross-CPU freeing a frequent occurrence?  From
looking at its current usage, it seemed like the allocation and freeing
were usually on the same CPU.

> In
> addition converting from a page to a virtual address is actually more
> expensive then you would think it should be. I went through and
> optimized that the best I could, but there is still a pretty
> significant cost to the call.

Were you able to break down where that cost occurred?  It looks
pretty cheap on x86-32 nohighmem:

343e:   a1 00 00 00 00          mov    0x0,%eax
                        343f: R_386_32  mem_map
3443:   29 f3                   sub    %esi,%ebx
3445:   89 5a 08                mov    %ebx,0x8(%edx)
3448:   29 c2                   sub    %eax,%edx
344a:   c1 fa 05                sar    $0x5,%edx
344d:   c1 e2 0c                shl    $0xc,%edx
3450:   8d 84 13 00 00 00 c0    lea    -0x4000(%ebx,%edx,1),%eax
3457:   8b 5d f8                mov    -0x8(%ebp),%ebx
345a:   8b 75 fc                mov    -0x4(%ebp),%esi
345d:   89 ec                   mov    %ebp,%esp
345f:   5d                      pop    %ebp
3460:   c3                      ret

I did code up a variant which stored the virtual address of the offset
in page->index, but I threw that away after seeing how much more code
it turned into.  Fortunately I had a better idea of how to implement that
early this morning.  I haven't even tested it yet, but it looks like better
generated code:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 28d9a4c9c5fd..ec38204eb903 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4357,8 +4357,8 @@ struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
unsigned int offset = PAGE_SIZE << compound_order(old);
 
if (offset > size) {
-   old->offset = offset;
-   return old;
+   page = old;
+   goto reset;
}
}
 
@@ -4379,7 +4379,10 @@ struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
if (old)
put_page(old);
nc->page = page;
-   page->offset = PAGE_SIZE << compound_order(page);
+   page->private = (PAGE_SIZE << compound_order(page)) - 1;
+reset:
+   page->index = (unsigned long)page_to_virt(page) +
+   (PAGE_SIZE << compound_order(page));
return page;
 }
 
@@ -4402,20 +4405,26 @@ void *page_frag_alloc(struct page_frag_cache *nc,
  unsigned int size, gfp_t gfp_mask)
 {
struct page *page = nc->page;
-   unsigned int offset = page->offset;
+   unsigned long addr = addr;
+   unsigned int offset = 0;
 
-   if (unlikely(!page || offset < size)) {
+   if (page) {
+   addr = page->index;
+   offset =

Re: [PATCH] mlx5: Remove call to ida_pre_get

2018-03-15 Thread Matthew Wilcox
On Thu, Mar 15, 2018 at 11:58:07PM +0000, Saeed Mahameed wrote:
> On Wed, 2018-03-14 at 19:57 -0700, Matthew Wilcox wrote:
> > From: Matthew Wilcox <mawil...@microsoft.com>
> > 
> > The mlx5 driver calls ida_pre_get() in a loop for no readily apparent
> > reason.  The driver uses ida_simple_get() which will call
> > ida_pre_get()
> > by itself and there's no need to use ida_pre_get() unless using
> > ida_get_new().
> > 
> 
> Hi Matthew,
> 
> Is this is causing any issues ? or just a simple cleanup ?

I'm removing the API.  At the end of this cleanup, there will be no more
preallocation; instead we will rely on the slab allocator not sucking.

> Adding Maor, the author of this change,
> 
> I believe the idea is to speed up insert_fte (which calls
> ida_simple_get) since insert_fte runs under the FTE write semaphore,
> in this case if ida_pre_get was successful before taking the semaphore
> for all the FTE nodes in the loop, this will be a huge win for
> ida_simple_get which will immediately return success without even
> trying to allocate.

I think that's misguided.  The IDA allocator is only going to allocate
memory once in every 1024 allocations.  Also, it does try to allocate,
even if there are preallocated nodes.  So you're just wasting time,
unfortunately.
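For illustration, the whole pattern the driver needs is just this; the
IDA name is made up, and ida_simple_get() handles its own node
allocation internally:

	static DEFINE_IDA(fte_ida);	/* hypothetical IDA, for illustration */

	static int alloc_fte_index(void)
	{
		/* No ida_pre_get() needed; may sleep with GFP_KERNEL. */
		return ida_simple_get(&fte_ida, 0, 0, GFP_KERNEL);
	}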



[RFC 1/2] mm: Use page->mapping to indicate pfmemalloc

2018-03-15 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

I want to use page->index for a different purpose, so move the pfmemalloc
indicator from page->index to page->mapping.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/mm.h | 16 +---
 mm/page_alloc.c|  8 +++-
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4ef7fb1726ab..06ea71358bda 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1122,6 +1122,9 @@ static inline pgoff_t page_index(struct page *page)
 bool page_mapped(struct page *page);
 struct address_space *page_mapping(struct page *page);
 
+/* page->mapping cannot point into the zero page. */
+#define MAPPING_PFMEMALLOC ((struct address_space *)4)
+
 /*
  * Return true only if the page has been allocated with
  * ALLOC_NO_WATERMARKS and the low watermark was not
@@ -1129,11 +1132,7 @@ struct address_space *page_mapping(struct page *page);
  */
 static inline bool page_is_pfmemalloc(struct page *page)
 {
-   /*
-* Page index cannot be this large so this must be
-* a pfmemalloc page.
-*/
-   return page->index == -1UL;
+   return page->mapping == MAPPING_PFMEMALLOC;
 }
 
 /*
@@ -1142,12 +1141,7 @@ static inline bool page_is_pfmemalloc(struct page *page)
  */
 static inline void set_page_pfmemalloc(struct page *page)
 {
-   page->index = -1UL;
-}
-
-static inline void clear_page_pfmemalloc(struct page *page)
-{
-   page->index = 0;
+   page->mapping = MAPPING_PFMEMALLOC;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 796ce1b3e0a1..7a9c14214ed2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1047,7 +1047,7 @@ static __always_inline bool free_pages_prepare(struct 
page *page,
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
-   if (PageMappingFlags(page))
+   if (PageMappingFlags(page) || page_is_pfmemalloc(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
memcg_kmem_uncharge(page, order);
@@ -1802,8 +1802,8 @@ inline void post_alloc_hook(struct page *page, unsigned 
int order,
set_page_owner(page, order, gfp_flags);
 }
 
-static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-                                       unsigned int alloc_flags)
+static void prep_new_page(struct page *page, unsigned int order,
+   gfp_t gfp_flags, unsigned int alloc_flags)
 {
int i;
 
@@ -1824,8 +1824,6 @@ static void prep_new_page(struct page *page, unsigned int 
order, gfp_t gfp_flags
 */
if (alloc_flags & ALLOC_NO_WATERMARKS)
set_page_pfmemalloc(page);
-   else
-   clear_page_pfmemalloc(page);
 }
 
 /*
-- 
2.16.2
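For reference, the consumer side is unchanged by this patch; net/core/skbuff.c
already propagates the flag like this:

	static void skb_propagate_pfmemalloc(struct page *page,
					     struct sk_buff *skb)
	{
		if (page_is_pfmemalloc(page))
			skb->pfmemalloc = true;
	}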



[RFC 0/2] Shrink page_frag_cache

2018-03-15 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

I've just learned about the page_frag_cache allocator, and now I want
to use it everywhere ;-)

But before I start using it in other places, I want to see if it can
be improved at all.  The pfmemalloc flag is pretty specific to how the
network stack uses it (with GFP_ATOMIC), and the pagecnt_bias is tricky
to understand.  I think we can do better by just using the fields in
struct page directly.  I don't have a suitable setup for performance
testing this code ... Alex, is there any chance you'd have time to give
this a spin?

Matthew Wilcox (2):
  mm: Use page->mapping to indicate pfmemalloc
  page_frag_cache: Store metadata in struct page

 include/linux/mm.h   |  16 ++
 include/linux/mm_types.h |  17 +-
 mm/page_alloc.c  | 143 ---
 net/core/skbuff.c|   4 +-
 4 files changed, 82 insertions(+), 98 deletions(-)

-- 
2.16.2



[RFC 2/2] page_frag_cache: Store metadata in struct page

2018-03-15 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
currently-in-use struct page) by using the page's refcount directly
(instead of maintaining a bias) and storing our current progress through
the page in the same bits currently used for page->index.  We no longer
need to reflect the page pfmemalloc state if we're storing the page
directly.

On the downside, we now call page_address() on every allocation, and we
do an atomic_inc() rather than a non-atomic decrement, but we should
touch the same number of cachelines and there is far less code (and
the code is less complex).

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/mm_types.h |  17 +-
 mm/page_alloc.c  | 135 ---
 net/core/skbuff.c|   4 +-
 3 files changed, 74 insertions(+), 82 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c5dea402501..f922cb62bd91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -90,6 +90,7 @@ struct page {
union {
pgoff_t index;  /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
+   unsigned int offset;    /* page_frag highwater mark */
/* page_deferred_list().prev-- second tail page */
};
 
@@ -219,22 +220,8 @@ struct page {
 #endif
 } _struct_page_alignment;
 
-#define PAGE_FRAG_CACHE_MAX_SIZE   __ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER  get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 struct page_frag_cache {
-   void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   __u16 offset;
-   __u16 size;
-#else
-   __u32 offset;
-#endif
-   /* we maintain a pagecount bias, so that we dont dirty cache line
-* containing page->_refcount every time we allocate a fragment.
-*/
-   unsigned int pagecnt_bias;
-   bool pfmemalloc;
+   struct page *page;
 };
 
 typedef unsigned long vm_flags_t;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a9c14214ed2..f8a176aab287 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4319,34 +4319,72 @@ void free_pages(unsigned long addr, unsigned int order)
 EXPORT_SYMBOL(free_pages);
 
 /*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
+ * The page fragment allocator is simple, yet effective.  It allocates
+ * pages from the page allocator, then hands out fragments of those
+ * pages to its callers.  It makes no effort to track which parts of
+ * the page remain in use, always allocating fresh memory.  The page
+ * reference count is used to keep track of whether any fragment is
+ * still in use; when all fragments in a page have been freed, the
+ * entire page is returned to the page allocator.
  *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ * The page fragment allocator performs no locking.  The caller is
+ * expected to ensure that two callers cannot simultaneously allocate
+ * from the same page_frag_cache.  Freeing is atomic and is permitted
+ * to happen simultaneously with other frees or an allocation.
+ *
+ * The allocator uses the struct page to store its state.  The 'offset'
+ * field in struct page is used to track how far through the page the
+ * allocation has proceeded.  The 'refcount' field is used to track
+ * how many fragments have been allocated from this page.  All other
+ * fields in struct page may be used by the owner of the page_frag_cache.
+ * The refcount is incremented by one while the page is still actively being
+ * allocated from; this prevents it from being freed prematurely.
  */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-gfp_t gfp_mask)
+
+#define PAGE_FRAG_ALLOC_SIZE   (64 * 1024)
+#define PAGE_FRAG_ORDER        get_order(PAGE_FRAG_ALLOC_SIZE)
+
+static noinline
+struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+   unsigned int size, gfp_t gfp_mask)
 {
+   struct page *old = nc->page;
struct page *page = NULL;
-   gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
-   page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-   PAGE_FRAG_CACHE_MAX_ORDER);
-   nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZ
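For orientation, the free path that the comment above relies on is
roughly today's page_frag_free(), sketched here rather than quoted from
the patch:

	void page_frag_free(void *addr)
	{
		struct page *page = virt_to_head_page(addr);

		/* Each fragment holds one page reference; dropping the
		 * last one returns the whole page to the page allocator. */
		if (unlikely(put_page_testzero(page)))
			__free_pages_ok(page, compound_order(page));
	}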

Re: rfc: remove print_vma_addr ? (was Re: [PATCH 00/16] remove eight obsolete architectures)

2018-03-15 Thread Matthew Wilcox
On Thu, Mar 15, 2018 at 09:56:46AM -0700, Joe Perches wrote:
> I have a patchset that creates a vsprintf extension for
> print_vma_addr and removes all the uses similar to the
> print_symbol() removal.
> 
> This now avoids any possible printk interleaving.
> 
> Unfortunately, without some #ifdef in vsprintf, which
> I would like to avoid, it increases the nommu kernel
> size by ~500 bytes.
> 
> Anyone think this is acceptable?
> 
> Here's the overall patch, but I have it as a series
> ---
>  Documentation/core-api/printk-formats.rst |  9 +
>  arch/arm64/kernel/traps.c | 13 +++
>  arch/mips/mm/fault.c  | 16 -
>  arch/parisc/mm/fault.c| 15 
>  arch/riscv/kernel/traps.c | 11 +++---
>  arch/s390/mm/fault.c  |  7 ++--
>  arch/sparc/mm/fault_32.c  |  8 ++---
>  arch/sparc/mm/fault_64.c  |  8 ++---
>  arch/tile/kernel/signal.c |  9 ++---
>  arch/um/kernel/trap.c | 13 +++
>  arch/x86/kernel/signal.c  | 10 ++
>  arch/x86/kernel/traps.c   | 18 --
>  arch/x86/mm/fault.c   | 12 +++
>  include/linux/mm.h|  1 -
>  lib/vsprintf.c| 58 
> ++-
>  mm/memory.c   | 33 --
>  16 files changed, 112 insertions(+), 129 deletions(-)

This doesn't feel like a huge win since it's only called ~once per
architecture.  I'd be more excited if it made the printing of the whole
thing standardised; eg we have a print_fault() function in mm/memory.c
which takes a suitable set of arguments.
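Something along these lines is what I mean; the name and signature here
are hypothetical, not an existing interface:

	/* Sketch only: a standardised fault-report helper. */
	void print_fault(struct pt_regs *regs, unsigned long address,
			 const char *reason)
	{
		pr_alert("%s: %s at %px, ip %px\n", current->comm, reason,
			 (void *)address, (void *)instruction_pointer(regs));
		print_vma_addr(KERN_CONT " in ", instruction_pointer(regs));
		pr_cont("\n");
	}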


[PATCH] mlx5: Remove call to ida_pre_get

2018-03-14 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The mlx5 driver calls ida_pre_get() in a loop for no readily apparent
reason.  The driver uses ida_simple_get() which will call ida_pre_get()
by itself and there's no need to use ida_pre_get() unless using
ida_get_new().

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 10e16381f20a..3ba07c7096ef 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1647,7 +1647,6 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
 
list_for_each_entry(iter, match_head, list) {
nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT);
-   ida_pre_get(&iter->g->fte_allocator, GFP_KERNEL);
}
 
 search_again_locked:



Re: [RFC PATCH V1 01/12] audit: add container id

2018-03-02 Thread Matthew Wilcox
On Fri, Mar 02, 2018 at 10:48:42AM -0500, Paul Moore wrote:
> On Thu, Mar 1, 2018 at 8:41 PM, Richard Guy Briggs  wrote:
> > On 2018-03-01 14:41, Richard Guy Briggs wrote:
> FYI, I think you may have a problem with something in your outgoing
> mail path; I didn't receive the original patchset you are referencing
> and it doesn't appear in the mail archive either.

I have those patches.  Which mail archive is missing them?


[GIT PULL] Fixes for IDR

2018-02-26 Thread Matthew Wilcox

Hi Linus,

I have one test-suite build fix for you and one run-time regression fix.
The regression fix includes new tests to make sure they don't pop back up.

The following changes since commit 3664ce2d930983966d2aac0e167f1332988c4e25:

  Merge tag 'powerpc-4.16-4' of 
git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux (2018-02-24 
16:05:50 -0800)

are available in the Git repository at:

  git://git.infradead.org/users/willy/linux-dax.git idr-2018-02-06

for you to fetch changes up to 4b0ad07653ee94182e2d8f21404242c9e83ad0b4:

  idr: Fix handling of IDs above INT_MAX (2018-02-26 14:39:30 -0500)


Matthew Wilcox (2):
  radix tree test suite: Fix build
  idr: Fix handling of IDs above INT_MAX

 lib/idr.c   | 13 ---
 tools/testing/radix-tree/idr-test.c | 52 +
 tools/testing/radix-tree/linux.c| 11 +-
 tools/testing/radix-tree/linux/compiler_types.h |  0
 tools/testing/radix-tree/linux/gfp.h|  1 +
 tools/testing/radix-tree/linux/slab.h   |  6 +++
 6 files changed, 75 insertions(+), 8 deletions(-)
 create mode 100644 tools/testing/radix-tree/linux/compiler_types.h



Re: [PATCH 0/2] mark some slabs as visible not mergeable

2018-02-24 Thread Matthew Wilcox
On Sat, Feb 24, 2018 at 11:04:52AM -0800, Stephen Hemminger wrote:
> This fixes an old bug in iproute2's ss command because it was
> reading slabinfo to get statistics. There isn't a better API
> to do this, and one can argue that /proc is a UAPI that must
> not change.
> 
> Therefore this patch set adds a flag to slab to give another
> reason to prevent merging, and then uses it in network code.

This is exactly the solution I would have suggested.  Note that SLUB
has always had slab merging, so this tool has been broken since 2.6.22
on any kernel with CONFIG_SLUB.

Reviewed-by: Matthew Wilcox <mawil...@microsoft.com>


Re: tcp_bind_bucket is missing from slabinfo

2018-02-24 Thread Matthew Wilcox
On Fri, Feb 23, 2018 at 10:50:30PM -0800, Stephen Hemminger wrote:
> Somewhere back around 3.17 the kmem cache "tcp_bind_bucket" dropped out
> of /proc/slabinfo. It turns out the ss command was dumpster diving
> in slabinfo to determine the number of bound sockets and now it always
> reports 0.
> 
> Not sure why, the cache is still created but it doesn't
> show in slabinfo. Could it be some part of making slab/slub common code
> (or network namespaces). The cache is created in tcp_init but not visible.
> 
> Any ideas?

Try booting with slab_nomerge=1


Re: Fwd: Re: Kernel panic with 4.16-rc1 (and 4.16-rc2) running selftest

2018-02-23 Thread Matthew Wilcox
On Sat, Feb 24, 2018 at 01:49:35AM +0000, Chris Mi wrote:
> To verify this patch, the following is a sanity test case:
> 
> # tc qdisc delete dev $link ingress > /dev/null 2>&1;
> # tc qdisc add dev $link ingress;
> # tc filter add dev $link prio 1 protocol ip handle 0x80000001 parent ffff: 
> flower skip_hw src_mac e4:11:0:0:0:2 dst_mac e4:12:0:0:0:2 action drop;
> # tc filter show dev $link parent ffff:
> 
> filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x80000001

I added these tests to my local tree for now.

diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 44ef9eba5a7a..28d99325a32d 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -178,6 +178,29 @@ void idr_get_next_test(int base)
idr_destroy();
 }
 
+void idr_u32_test(struct idr *idr, int base)
+{
+   assert(idr_is_empty(idr));
+   idr_init_base(idr, base);
+   u32 handle = 10;
+   idr_alloc_u32(idr, NULL, &handle, handle, GFP_KERNEL);
+   BUG_ON(handle != 10);
+   idr_remove(idr, handle);
+   assert(idr_is_empty(idr));
+
+   handle = 0x80000001;
+   idr_alloc_u32(idr, NULL, &handle, handle, GFP_KERNEL);
+   BUG_ON(handle != 0x80000001);
+   idr_remove(idr, handle);
+   assert(idr_is_empty(idr));
+
+   handle = 0xffffffe0;
+   idr_alloc_u32(idr, NULL, &handle, handle, GFP_KERNEL);
+   BUG_ON(handle != 0xffffffe0);
+   idr_remove(idr, handle);
+   assert(idr_is_empty(idr));
+}
+
 void idr_checks(void)
 {
unsigned long i;
@@ -248,6 +271,9 @@ void idr_checks(void)
idr_get_next_test(0);
idr_get_next_test(1);
idr_get_next_test(4);
+   idr_u32_test(&idr, 0);
+   idr_u32_test(&idr, 1);
+   idr_u32_test(&idr, 4);
 }
 
 /*


Re: Fwd: Re: Kernel panic with 4.16-rc1 (and 4.16-rc2) running selftest

2018-02-23 Thread Matthew Wilcox
On Fri, Feb 23, 2018 Randy Dunlap wrote:
> [add Matthew Wilcox; hopefully he can look/see]

Thanks, Randy.  I don't understand why nobody else thought to cc the
author of the patch that it was bisected to ...

> On 02/23/2018 04:13 PM, Cong Wang wrote:
> > On Fri, Feb 23, 2018 at 3:27 PM, Cong Wang <xiyou.wangc...@gmail.com>
> wrote:
> >> On Fri, Feb 23, 2018 at 11:00 AM, Randy Dunlap <rdun...@infradead.org>
> wrote:
> >>> On 02/23/2018 08:05 AM, Khalid Aziz wrote:
> >>>> Same selftest does not cause panic on 4.15. git bisect pointed to
> commit 6ce711f2750031d12cec91384ac5cfa0a485b60a ("idr: Make 1-based IDRs
> more efficient").
> >>>> Kernel config is attached.
> >>
> >> Looks like something horribly wrong with u32 key id idr...
> >
> > Adding a few printk's, I got:
> >
> > [   31.231560] requested handle = ffffffe0
> > [   31.232426] allocated handle = 0
> > ...
> > [   31.246475] requested handle = ffffffd0
> > [   31.247555] allocated handle = 1
> >
> >
> > So the bug is here where we can't allocate a specific handle:
> >
> > err = idr_alloc_u32(&tp_c->handle_idr, ht, &handle,
> > handle, GFP_KERNEL);
> > if (err) {
> > kfree(ht);
> > return err;
> > }

Please try this patch.  It fixes ffffffe0, but there may be more things
tested that it may not work for.

Chris Mi, what happened to that set of testcases you promised to write
for me?

diff --git a/lib/idr.c b/lib/idr.c
index c98d77fcf393..10d9b8d47c33 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -36,8 +36,8 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
 {
struct radix_tree_iter iter;
void __rcu **slot;
-   int base = idr->idr_base;
-   int id = *nextid;
+   unsigned int base = idr->idr_base;
+   unsigned int id = *nextid;
 
if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
return -EINVAL;
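To spell out the failure mode the signedness change fixes (values taken
from the report above):

	u32 requested = 0xffffffe0;
	int id = requested;		/* truncates to -32 */
	unsigned int uid = requested;	/* keeps 0xffffffe0 */

With a signed 'id', a requested handle above INT_MAX goes negative and
the allocator starts searching from the wrong place.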


Re: kmem_cache_attr (was Re: [PATCH 04/36] usercopy: Prepare for usercopy whitelisting)

2018-01-16 Thread Matthew Wilcox
On Tue, Jan 16, 2018 at 12:17:01PM -0600, Christopher Lameter wrote:
> Draft patch of how the data structs could change. kmem_cache_attr is read
> only.

Looks good.  Although I would add Kees' user feature:

struct kmem_cache_attr {
char name[16];
unsigned int size;
unsigned int align;
+   unsigned int useroffset;
+   unsigned int usersize;
slab_flags_t flags;
kmem_cache_ctor ctor;
}

And I'd start with 
+struct kmem_cache *kmem_cache_create_attr(const struct kmem_cache_attr *);

leaving the old kmem_cache_create to kmalloc a kmem_cache_attr and
initialise it.

Can we also do something like this?

-#define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
-   sizeof(struct __struct), __alignof__(struct __struct),\
-   (__flags), NULL)
+#define KMEM_CACHE(__struct, __flags) ({   \
+   const struct kmem_cache_attr kca_ ## __struct = {   \
+   .name = #__struct,  \
+   .size = sizeof(struct __struct),\
+   .align = __alignof__(struct __struct),  \
+   .flags = (__flags), \
+   };  \
+   kmem_cache_create_attr(&kca_ ## __struct);  \
+})

That way we won't need to convert any of those users.
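Call sites would then keep their current one-line shape, e.g. (struct
foo is hypothetical):

	struct kmem_cache *foo_cache = KMEM_CACHE(foo, SLAB_HWCACHE_ALIGN);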



Re: kmem_cache_attr (was Re: [PATCH 04/36] usercopy: Prepare for usercopy whitelisting)

2018-01-16 Thread Matthew Wilcox
On Tue, Jan 16, 2018 at 10:54:27AM -0600, Christopher Lameter wrote:
> On Tue, 16 Jan 2018, Matthew Wilcox wrote:
> 
> > I think that's a good thing!  /proc/slabinfo really starts to get grotty
> > above 16 bytes.  I'd like to chop off "_cache" from the name of every
> > single slab!  If ext4_allocation_context has to become ext4_alloc_ctx,
> > I don't think we're going to lose any valuable information.
> 
> Ok so we are going to cut off at 16 characters? Sounds good to me.

Excellent!

> > > struct kmem_cache_attr {
> > >   char *name;
> > >   size_t size;
> > >   size_t align;
> > >   slab_flags_t flags;
> > >   unsigned int useroffset;
> > >   unsigned int usersize;
> > >   void (*ctor)(void *);
> > >   kmem_isolate_func *isolate;
> > >   kmem_migrate_func *migrate;
> > >   ...
> > > }
> >
> > In these slightly-more-security-conscious days, it's considered poor
> > practice to have function pointers in writable memory.  That was why
> > I wanted to make the kmem_cache_attr const.
> 
> Sure this data is never changed. It can be const.

It's changed at initialisation.  Look:

kmem_cache_create(const char *name, size_t size, size_t align,
  slab_flags_t flags, void (*ctor)(void *))
s = create_cache(cache_name, size, size,
 calculate_alignment(flags, align, size),
 flags, ctor, NULL, NULL);

The 'align' that ends up in s->align, is not the user-specified align.
It's also dependent on runtime information (cache_line_size()), so it
can't be calculated at compile time.

'flags' also gets mangled:
flags &= CACHE_CREATE_MASK;


> I am not married to either way of specifying the sizes. unsigned int would
> be fine with me. SLUB falls back to the page allocator anyways for
> anything above 2* PAGE_SIZE and I think we can do the same for the other
> allocators as well. Zeroing or initializing such a large memory chunk is
> much more expensive than the allocation so it does not make much sense to
> have that directly supported in the slab allocators.

The only slabs larger than 4kB on my system right now are:
kvm_vcpu               0      0  19136    1    8 : tunables    8    4    0 : slabdata      0      0      0
net_namespace          1      1   6080    1    2 : tunables    8    4    0 : slabdata      1      1      0

(other than the fake slabs for kmalloc)

> Some platforms support 64K page size and I could envision a 2M page size
> at some point. So I think we cannot use 16 bits there.
> 
> If no one objects then I can use unsigned int there again.

unsigned int would be my preference.


Re: kmem_cache_attr (was Re: [PATCH 04/36] usercopy: Prepare for usercopy whitelisting)

2018-01-16 Thread Matthew Wilcox
On Tue, Jan 16, 2018 at 09:21:30AM -0600, Christopher Lameter wrote:
> > struct kmem_cache_attr {
> > const char name[32];
> 
> Want to avoid the string reference mess that occurred in the past?
> Is that really necessary? But it would limit the size of the name.

I think that's a good thing!  /proc/slabinfo really starts to get grotty
above 16 bytes.  I'd like to chop off "_cache" from the name of every
single slab!  If ext4_allocation_context has to become ext4_alloc_ctx,
I don't think we're going to lose any valuable information.

My real intent was to reduce the number of allocations; if we can make
it not necessary to kstrdup the name, I think that'd be appreciated by
our CONFIG_TINY friends.

> > (my rationale is that everything in attr should be const, but size, align
> > and flags all get modified by the slab code).
> 
> Thought about putting all the parameters into the kmem_cache_attr struct.
> 
> So
> 
> struct kmem_cache_attr {
>   char *name;
>   size_t size;
>   size_t align;
>   slab_flags_t flags;
>   unsigned int useroffset;
>   unsigned int usersize;
>   void (*ctor)(void *);
>   kmem_isolate_func *isolate;
>   kmem_migrate_func *migrate;
>   ...
> }

In these slightly-more-security-conscious days, it's considered poor
practice to have function pointers in writable memory.  That was why
I wanted to make the kmem_cache_attr const.

Also, there's no need for 'size' and 'align' to be size_t.  Slab should
never support allocations above 4GB in size.  I'm not even keen on seeing
allocations above 64kB, but I see my laptop has six 512kB allocations (!),
three 256kB allocations and seven 128kB allocations, so I must reluctantly
concede that using an unsigned int is necessary.  If I were really into
bitshaving, I might force all allocations to be a multiple of 32-bytes
in size, and then we could use 16 bits to represent an allocation between
32 and 2MB, but I think that tips us beyond the complexity boundary.
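The arithmetic, for the record; a sketch of an encoding I'm not
actually proposing:

	static inline u16 size_to_granules(unsigned int size)
	{
		return DIV_ROUND_UP(size, 32);	/* 32-byte units */
	}

	static inline unsigned int granules_to_size(u16 g)
	{
		return g * 32;	/* up to 65535 * 32 = 2096160, just under 2MB */
	}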



Re: [PATCH 04/36] usercopy: Prepare for usercopy whitelisting

2018-01-14 Thread Matthew Wilcox
On Wed, Jan 10, 2018 at 12:28:23PM -0600, Christopher Lameter wrote:
> On Tue, 9 Jan 2018, Kees Cook wrote:
> > +struct kmem_cache *kmem_cache_create_usercopy(const char *name,
> > +   size_t size, size_t align, slab_flags_t flags,
> > +   size_t useroffset, size_t usersize,
> > +   void (*ctor)(void *));
> 
> Hmmm... At some point we should switch kmem_cache_create to pass a struct
> containing all the parameters. Otherwise the API will blow up with
> additional functions.

Obviously I agree with you.  I'm inclined to not let that delay Kees'
patches; we can fix the few places that use this API later.  At this
point, my proposal for the ultimate form would be:

struct kmem_cache_attr {
const char name[32];
void (*ctor)(void *);
unsigned int useroffset;
unsigned int user_size; 
};

kmem_create_cache_attr(const struct kmem_cache_attr *attr, unsigned int size,
unsigned int align, slab_flags_t flags)

(my rationale is that everything in attr should be const, but size, align
and flags all get modified by the slab code).
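A hypothetical call site under that form, with illustrative names only:

	static const struct kmem_cache_attr foo_attr = {
		.name = "foo",
		.ctor = foo_ctor,	/* assumed constructor */
		.useroffset = offsetof(struct foo, data),
		.usersize = 60,
	};

	cache = kmem_create_cache_attr(&foo_attr, sizeof(struct foo),
				__alignof__(struct foo), SLAB_HWCACHE_ALIGN);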



Re: [PATCH 13/38] ext4: Define usercopy region in ext4_inode_cache slab cache

2018-01-14 Thread Matthew Wilcox
On Thu, Jan 11, 2018 at 03:05:14PM -0800, Kees Cook wrote:
> On Thu, Jan 11, 2018 at 9:01 AM, Theodore Ts'o  wrote:
> > On Wed, Jan 10, 2018 at 06:02:45PM -0800, Kees Cook wrote:
> >> The ext4 symlink pathnames, stored in struct ext4_inode_info.i_data
> >> and therefore contained in the ext4_inode_cache slab cache, need
> >> to be copied to/from userspace.
> >
> > Symlink operations to/from userspace aren't common or in the hot path,
> > and when they are in i_data, limited to at most 60 bytes.  Is it worth
> > it to copy through a bounce buffer so as to disallow any usercopies
> > into struct ext4_inode_info?
> 
> If this is the only place it's exposed, yeah, that might be a way to
> avoid the per-FS patches. This would, AIUI, require changing
> readlink_copy() to include a bounce buffer, and that would require an
> allocation. I kind of prefer just leaving the per-FS whitelists, as
> then there's no global overhead added.

I think Ted was proposing having a per-FS patch that would, say, copy
up to 60 bytes to the stack, then memcpy it into the ext4_inode_info.
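i.e. something like this in the symlink store path (a sketch of that
suggestion, not ext4 code; 60 bytes matches the i_data limit):

	char buf[60];

	if (copy_from_user(buf, src, len))	/* len <= sizeof(buf) */
		return -EFAULT;
	memcpy(EXT4_I(inode)->i_data, buf, len);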


[PATCH v2 01/17] idr: Fix build

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The IDR calls WARN_ON without including <linux/bug.h>

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7c3a365f7e12..dd048cf456b7 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -13,6 +13,7 @@
 #define __IDR_H__
 
 #include <linux/radix-tree.h>
+#include <linux/bug.h>
 #include <linux/gfp.h>
 #include <linux/percpu.h>
 
-- 
2.15.0



[PATCH v2 07/17] idr: Delete idr_find_ext function

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Simply changing idr_remove's 'id' argument to 'unsigned long' works
for all callers.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h| 7 +--
 net/sched/act_api.c| 2 +-
 net/sched/cls_flower.c | 2 +-
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 90dbe7a3735c..12514ec0cd28 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -179,16 +179,11 @@ static inline void idr_preload_end(void)
  * This function can be called under rcu_read_lock(), given that the leaf
  * pointers lifetimes are correctly managed.
  */
-static inline void *idr_find_ext(const struct idr *idr, unsigned long id)
+static inline void *idr_find(const struct idr *idr, unsigned long id)
 {
return radix_tree_lookup(&idr->idr_rt, id);
 }
 
-static inline void *idr_find(const struct idr *idr, int id)
-{
-   return idr_find_ext(idr, id);
-}
-
 /**
  * idr_for_each_entry - iterate over an idr's elements of a given type
  * @idr: idr handle
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 7e901e855d68..efb90b8a3bf0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -222,7 +222,7 @@ static struct tc_action *tcf_idr_lookup(u32 index, struct 
tcf_idrinfo *idrinfo)
struct tc_action *p = NULL;
 
spin_lock_bh(&idrinfo->lock);
-   p = idr_find_ext(&idrinfo->action_idr, index);
+   p = idr_find(&idrinfo->action_idr, index);
spin_unlock_bh(&idrinfo->lock);
 
return p;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ca71823bee03..ec0dc92f6104 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -329,7 +329,7 @@ static void *fl_get(struct tcf_proto *tp, u32 handle)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
 
-   return idr_find_ext(&head->handle_idr, handle);
+   return idr_find(&head->handle_idr, handle);
 }
 
 static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
-- 
2.15.0



[PATCH v2 17/17] idr: Warn if old iterators see large IDs

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Now that the IDR can be used to store large IDs, it is possible somebody
might only partially convert their old code and use the iterators which
can only handle IDs up to INT_MAX.  It's probably unwise to show them a
truncated ID, so settle for spewing warnings to dmesg, and terminating
the iteration.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 lib/idr.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/idr.c b/lib/idr.c
index 772a24513d1e..1aaeb5a8c426 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -145,7 +145,11 @@ int idr_for_each(const struct idr *idr,
void __rcu **slot;
 
radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
-   int ret = fn(iter.index, rcu_dereference_raw(*slot), data);
+   int ret;
+
+   if (WARN_ON_ONCE(iter.index > INT_MAX))
+   break;
+   ret = fn(iter.index, rcu_dereference_raw(*slot), data);
if (ret)
return ret;
}
@@ -173,6 +177,9 @@ void *idr_get_next(struct idr *idr, int *nextid)
if (!slot)
return NULL;
 
+   if (WARN_ON_ONCE(iter.index > INT_MAX))
+   return NULL;
+
*nextid = iter.index;
return rcu_dereference_raw(*slot);
 }
-- 
2.15.0
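The two iterator flavours side by side, for anyone mid-conversion (the
_ul form is introduced elsewhere in this series):

	void *entry;
	int id;
	unsigned long ul_id;

	idr_for_each_entry(&idr, entry, id)		/* now WARNs past INT_MAX */
		pr_info("id %d\n", id);
	idr_for_each_entry_ul(&idr, entry, ul_id)	/* full range */
		pr_info("id %lu\n", ul_id);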



[PATCH v2 16/17] idr: Rename idr_for_each_entry_ext

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Match idr_alloc_ul with idr_get_next_ul and idr_for_each_entry_ul.
Also add kernel-doc.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 17 ++---
 lib/idr.c   | 20 +++-
 net/sched/act_api.c |  6 +++---
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 344380fd0887..91d27a9bcdf4 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -88,7 +88,7 @@ int idr_alloc_cyclic(struct idr *, void *entry, int start, 
int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
-void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
+void *idr_get_next_ul(struct idr *, unsigned long *nextid);
 void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
@@ -178,8 +178,19 @@ static inline void *idr_find(const struct idr *idr, 
unsigned long id)
  */
 #define idr_for_each_entry(idr, entry, id) \
for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id)
-#define idr_for_each_entry_ext(idr, entry, id) \
-   for (id = 0; ((entry) = idr_get_next_ext(idr, &(id))) != NULL; ++id)
+
+/**
+ * idr_for_each_entry_ul() - iterate over an IDR's elements of a given type.
+ * @idr: IDR handle.
+ * @entry: The type * to use as cursor.
+ * @id: Entry ID.
+ *
+ * @entry and @id do not need to be initialized before the loop, and
+ * after normal termination @entry is left with the value NULL.  This
+ * is convenient for a "not found" value.
+ */
+#define idr_for_each_entry_ul(idr, entry, id)  \
+   for (id = 0; ((entry) = idr_get_next_ul(idr, &(id))) != NULL; ++id)
 
 /**
  * idr_for_each_entry_continue - continue iteration over an idr's elements of 
a given type
diff --git a/lib/idr.c b/lib/idr.c
index 103afb97b4bd..772a24513d1e 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -155,9 +155,9 @@ int idr_for_each(const struct idr *idr,
 EXPORT_SYMBOL(idr_for_each);
 
 /**
- * idr_get_next - Find next populated entry
- * @idr: idr handle
- * @nextid: Pointer to lowest possible ID to return
+ * idr_get_next() - Find next populated entry.
+ * @idr: IDR handle.
+ * @nextid: Pointer to lowest possible ID to return.
  *
  * Returns the next populated entry in the tree with an ID greater than
  * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
@@ -178,7 +178,17 @@ void *idr_get_next(struct idr *idr, int *nextid)
 }
 EXPORT_SYMBOL(idr_get_next);
 
-void *idr_get_next_ext(struct idr *idr, unsigned long *nextid)
+/**
+ * idr_get_next_ul() - Find next populated entry.
+ * @idr: IDR handle.
+ * @nextid: Pointer to lowest possible ID to return.
+ *
+ * Returns the next populated entry in the tree with an ID greater than
+ * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
+ * to the ID of the found value.  To use in a loop, the value pointed to by
+ * nextid must be incremented by the user.
+ */
+void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
 {
struct radix_tree_iter iter;
void __rcu **slot;
@@ -190,7 +200,7 @@ void *idr_get_next_ext(struct idr *idr, unsigned long 
*nextid)
*nextid = iter.index;
return rcu_dereference_raw(*slot);
 }
-EXPORT_SYMBOL(idr_get_next_ext);
+EXPORT_SYMBOL(idr_get_next_ul);
 
 /**
  * idr_replace - replace pointer for given id
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 156302c110af..4133d91b7029 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -124,7 +124,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
 
s_i = cb->args[0];
 
-   idr_for_each_entry_ext(idr, p, id) {
+   idr_for_each_entry_ul(idr, p, id) {
index++;
if (index < s_i)
continue;
@@ -181,7 +181,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
 
-   idr_for_each_entry_ext(idr, p, id) {
+   idr_for_each_entry_ul(idr, p, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
@@ -351,7 +351,7 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
int ret;
unsigned long id = 1;
 
-   idr_for_each_entry_ext(idr, p, id) {
+   idr_for_each_entry_ul(idr, p, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
-- 
2.15.0



[PATCH v2 13/17] cls_u32: Reinstate cyclic allocation

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Commit e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles")
converted htid allocation to use the IDR.  The ID allocated by this
scheme changes; it used to be cyclic, but now always allocates the
lowest available.  The IDR supports cyclic allocation, so just use
the right function.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_u32.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 9d48674a70e0..e65b47483dc0 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -316,19 +316,13 @@ static void *u32_get(struct tcf_proto *tp, u32 handle)
return u32_lookup_key(ht, handle);
 }
 
+/* Protected by rtnl lock */
 static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
 {
-   unsigned long idr_index;
-   int err;
-
-   /* This is only used inside rtnl lock it is safe to increment
-* without read _copy_ update semantics
-*/
-   err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index,
-   1, 0x7FF, GFP_KERNEL);
-   if (err)
+   int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL);
+   if (id < 0)
return 0;
-   return (u32)(idr_index | 0x800) << 20;
+   return (id | 0x800U) << 20;
 }
 
 static struct hlist_head *tc_u_common_hash;
-- 
2.15.0
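For readers unfamiliar with u32 handles: the hash table id occupies the
top twelve bits, which is why the allocated index is ORed with 0x800
and shifted left by 20.  The masks come from include/uapi/linux/pkt_cls.h:

	#define TC_U32_HTID(h) ((h)&0xFFF00000)
	#define TC_U32_NODE(h) ((h)&0x000FFFFF)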



[PATCH v2 10/17] cls_basic: Convert to use idr_alloc_u32

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper which saves a temporary variable and a few lines of
code.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_basic.c | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 147700afcf31..c4b242fee8e4 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -182,7 +182,6 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
struct nlattr *tb[TCA_BASIC_MAX + 1];
struct basic_filter *fold = (struct basic_filter *) *arg;
struct basic_filter *fnew;
-   unsigned long idr_index;
 
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
@@ -205,21 +204,17 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
if (err < 0)
goto errout;
 
-   if (handle) {
-   fnew->handle = handle;
-   if (!fold) {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   handle, handle + 1, GFP_KERNEL);
-   if (err)
-   goto errout;
-   }
-   } else {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   1, 0x7FFFFFFF, GFP_KERNEL);
-   if (err)
-   goto errout;
-   fnew->handle = idr_index;
+   if (!handle) {
+   handle = 1;
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   INT_MAX, GFP_KERNEL);
+   } else if (!fold) {
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   handle, GFP_KERNEL);
}
+   if (err)
+   goto errout;
+   fnew->handle = handle;
 
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
if (err < 0) {
-- 
2.15.0



[PATCH v2 12/17] cls_flower: Convert to idr_alloc_u32

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper which saves a temporary variable and a few lines
of code.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_flower.c | 26 ++
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ec0dc92f6104..adee3cf30bb3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -858,7 +858,6 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr **tb;
struct fl_flow_mask mask = {};
-   unsigned long idr_index;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -889,21 +888,17 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
goto errout;
 
if (!handle) {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   1, 0x80000000, GFP_KERNEL);
-   if (err)
-   goto errout;
-   fnew->handle = idr_index;
-   }
-
-   /* user specifies a handle and it doesn't exist */
-   if (handle && !fold) {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   handle, handle + 1, GFP_KERNEL);
-   if (err)
-   goto errout;
-   fnew->handle = idr_index;
+   handle = 1;
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   INT_MAX, GFP_KERNEL);
+   } else if (!fold) {
+   /* user specifies a handle and it doesn't exist */
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   handle, GFP_KERNEL);
}
+   if (err)
+   goto errout;
+   fnew->handle = handle;
 
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
@@ -957,7 +952,6 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
*arg = fnew;
 
if (fold) {
-   fnew->handle = handle;
idr_replace(&head->handle_idr, fnew, fnew->handle);
list_replace_rcu(&fold->list, &fnew->list);
tcf_unbind_filter(tp, &fold->res);
-- 
2.15.0



[PATCH v2 11/17] cls_bpf: Convert to use idr_alloc_u32

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper.  This has a modest reduction in both lines of code
and compiled code size.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_bpf.c | 24 ++--
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 1660fc8294ef..db1dd4de7d6a 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -472,7 +472,6 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_bpf_prog *oldprog = *arg;
struct nlattr *tb[TCA_BPF_MAX + 1];
struct cls_bpf_prog *prog;
-   unsigned long idr_index;
int ret;
 
if (tca[TCA_OPTIONS] == NULL)
@@ -499,21 +498,18 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
}
 
if (handle == 0) {
-   ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
-   1, 0x7FFFFFFF, GFP_KERNEL);
-   if (ret)
-   goto errout;
-   prog->handle = idr_index;
-   } else {
-   if (!oldprog) {
-   ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
-   handle, handle + 1, GFP_KERNEL);
-   if (ret)
-   goto errout;
-   }
-   prog->handle = handle;
+   handle = 1;
+   ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
+   INT_MAX, GFP_KERNEL);
+   } else if (!oldprog) {
+   ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
+   handle, GFP_KERNEL);
}
 
+   if (ret)
+   goto errout;
+   prog->handle = handle;
+
ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
if (ret < 0)
goto errout_idr;
-- 
2.15.0



[PATCH v2 06/17] idr: Delete idr_replace_ext function

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Changing idr_replace's 'id' argument to 'unsigned long' works for all
callers.  Callers which passed a negative ID now get -ENOENT instead of
-EINVAL.  No callers relied on this error value.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h|  3 +--
 lib/idr.c  | 15 +++
 net/sched/act_api.c|  2 +-
 net/sched/cls_basic.c  |  2 +-
 net/sched/cls_bpf.c|  2 +-
 net/sched/cls_flower.c |  2 +-
 net/sched/cls_u32.c|  2 +-
 7 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 9a4042489ec6..90dbe7a3735c 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -136,8 +136,7 @@ int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
 void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
-void *idr_replace(struct idr *, void *, int id);
-void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
+void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
 static inline void *idr_remove(struct idr *idr, unsigned long id)
diff --git a/lib/idr.c b/lib/idr.c
index 2593ce513a18..577bfd4fe5c2 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -147,18 +147,9 @@ EXPORT_SYMBOL(idr_get_next_ext);
  * the one being replaced!).
  *
  * Returns: the old value on success.  %-ENOENT indicates that @id was not
- * found.  %-EINVAL indicates that @id or @ptr were not valid.
+ * found.  %-EINVAL indicates that @ptr was not valid.
  */
-void *idr_replace(struct idr *idr, void *ptr, int id)
-{
-   if (id < 0)
-   return ERR_PTR(-EINVAL);
-
-   return idr_replace_ext(idr, ptr, id);
-}
-EXPORT_SYMBOL(idr_replace);
-
-void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id)
+void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
 {
struct radix_tree_node *node;
void __rcu **slot = NULL;
@@ -175,7 +166,7 @@ void *idr_replace_ext(struct idr *idr, void *ptr, unsigned 
long id)
 
return entry;
 }
-EXPORT_SYMBOL(idr_replace_ext);
+EXPORT_SYMBOL(idr_replace);
 
 /**
  * DOC: IDA description
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index bab81574a420..7e901e855d68 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -348,7 +348,7 @@ void tcf_idr_insert(struct tc_action_net *tn, struct 
tc_action *a)
struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
spin_lock_bh(&idrinfo->lock);
-   idr_replace_ext(&idrinfo->action_idr, a, a->tcfa_index);
+   idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
spin_unlock_bh(&idrinfo->lock);
 }
 EXPORT_SYMBOL(tcf_idr_insert);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d2193304bad0..147700afcf31 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -231,7 +231,7 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
*arg = fnew;
 
if (fold) {
-   idr_replace_ext(&head->handle_idr, fnew, fnew->handle);
+   idr_replace(&head->handle_idr, fnew, fnew->handle);
list_replace_rcu(&fold->link, &fnew->link);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index b017d99fd7e1..1660fc8294ef 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -526,7 +526,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
 
if (oldprog) {
-   idr_replace_ext(&head->handle_idr, prog, handle);
+   idr_replace(&head->handle_idr, prog, handle);
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
tcf_exts_get_net(&oldprog->exts);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 3e89b0be1706..ca71823bee03 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -958,7 +958,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
 
if (fold) {
fnew->handle = handle;
-   idr_replace_ext(&head->handle_idr, fnew, fnew->handle);
+   idr_replace(&head->handle_idr, fnew, fnew->handle);
list_replace_rcu(&fold->list, &fnew->list);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 6fe4e3549ad3..9d48674a70e0 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -833,7 +833,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct 
tc_u_common *tp_c,
if (pins->handle == n->handle)
break;
 
-   idr_replace_ext(&tp_c->handle_idr, n, n->handle);
+   idr_replace(&tp_c->handle_idr, n, n->handle);
RCU_INIT_POINTER(n->next, pins->next);
rcu_assign_pointer(*ins, n);
 }
-- 
2.15.0



[PATCH v2 08/17] idr: Add idr_alloc_u32 helper

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

All current users of idr_alloc_ext() actually want to allocate a u32 and
it's a little painful for them to use idr_alloc_ext().  This convenience
function makes it simple.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 29 +
 1 file changed, 29 insertions(+)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 12514ec0cd28..9b2fd6f408b2 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -139,6 +139,35 @@ void *idr_get_next_ext(struct idr *idr, unsigned long 
*nextid);
 void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
+/**
+ * idr_alloc_u32() - Allocate an ID.
+ * @idr: IDR handle.
+ * @ptr: Pointer to be associated with the new ID.
+ * @nextid: The new ID.
+ * @max: The maximum ID to allocate (inclusive).
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates an unused ID in the range [*nextid, max] and updates the @nextid
+ * pointer with the newly assigned ID.  Returns -ENOSPC and does not modify
+ * @nextid if there are no unused IDs in the range.
+ *
+ * The caller should provide their own locking to ensure that two concurrent
+ * modifications to the IDR are not possible.  Read-only accesses to the
+ * IDR may be done under the RCU read lock or may exclude simultaneous
+ * writers.
+ *
+ * Return: 0 on success, -ENOMEM for memory allocation errors, -ENOSPC if
+ * there are no free IDs in the range.
+ */
+static inline int __must_check idr_alloc_u32(struct idr *idr, void *ptr,
+   u32 *nextid, unsigned long max, gfp_t gfp)
+{
+   unsigned long tmp = *nextid;
+   int ret = idr_alloc_ext(idr, ptr, &tmp, tmp, max + 1, gfp);
+   *nextid = tmp;
+   return ret;
+}
+
 static inline void *idr_remove(struct idr *idr, unsigned long id)
 {
return radix_tree_delete_item(&idr->idr_rt, id, NULL);
-- 
2.15.0
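Typical usage, matching the classifier conversions elsewhere in this
series (the names are taken from the cls_basic patch):

	u32 handle = 1;
	int err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
				INT_MAX, GFP_KERNEL);
	if (err)
		return err;
	fnew->handle = handle;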



[PATCH v2 15/17] idr: Rename idr_alloc_ext to idr_alloc_ul

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

idr_alloc_ul fits better with other parts of the Linux kernel where we
need to name a function based on the types it operates on.

It uses a 'nextid' pointer argument instead of separate minimum ID and
output assigned ID, (like idr_get_next), reducing the number of arguments
by one.  It also takes a 'max' argument rather than an 'end' argument
(unlike idr_alloc, but the semantics of 'end' don't work for unsigned long
arguments).  And its return value is an errno, so mark it as __must_check.

Includes kernel-doc for idr_alloc_ul, which idr_alloc_ext didn't have,
and I realised we were missing a test-case where idr_alloc_cyclic wraps
around INT_MAX.  Chris Mi <chr...@mellanox.com> has promised to contribute
test-cases for idr_alloc_ul.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 55 ++---
 include/linux/radix-tree.h  | 17 +--
 lib/idr.c   | 99 +
 lib/radix-tree.c|  3 +-
 net/sched/cls_u32.c | 20 
 tools/testing/radix-tree/idr-test.c | 17 +++
 6 files changed, 111 insertions(+), 100 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 9b2fd6f408b2..344380fd0887 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -13,7 +13,6 @@
 #define __IDR_H__
 
 #include 
-#include 
 #include 
 #include 
 
@@ -82,55 +81,9 @@ static inline void idr_set_cursor(struct idr *idr, unsigned 
int val)
 
 void idr_preload(gfp_t gfp_mask);
 
-int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index,
- unsigned long start, unsigned long end, gfp_t gfp,
- bool ext);
-
-/**
- * idr_alloc - allocate an id
- * @idr: idr handle
- * @ptr: pointer to be associated with the new id
- * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive)
- * @gfp: memory allocation flags
- *
- * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
- * if there are no unused IDs in that range.
- *
- * Note that @end is treated as max when <= 0.  This is to always allow
- * using @start + N as @end as long as N is inside integer range.
- *
- * Simultaneous modifications to the @idr are not allowed and should be
- * prevented by the user, usually with a lock.  idr_alloc() may be called
- * concurrently with read-only accesses to the @idr, such as idr_find() and
- * idr_for_each_entry().
- */
-static inline int idr_alloc(struct idr *idr, void *ptr,
-   int start, int end, gfp_t gfp)
-{
-   unsigned long id;
-   int ret;
-
-   if (WARN_ON_ONCE(start < 0))
-   return -EINVAL;
-
-   ret = idr_alloc_cmn(idr, ptr, &id, start, end, gfp, false);
-
-   if (ret)
-   return ret;
-
-   return id;
-}
-
-static inline int idr_alloc_ext(struct idr *idr, void *ptr,
-   unsigned long *index,
-   unsigned long start,
-   unsigned long end,
-   gfp_t gfp)
-{
-   return idr_alloc_cmn(idr, ptr, index, start, end, gfp, true);
-}
-
+int idr_alloc(struct idr *, void *, int start, int end, gfp_t);
+int __must_check idr_alloc_ul(struct idr *, void *, unsigned long *nextid,
+   unsigned long max, gfp_t);
 int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
@@ -163,7 +116,7 @@ static inline int __must_check idr_alloc_u32(struct idr 
*idr, void *ptr,
u32 *nextid, unsigned long max, gfp_t gfp)
 {
unsigned long tmp = *nextid;
-   int ret = idr_alloc_ext(idr, ptr, &tmp, tmp, max + 1, gfp);
+   int ret = idr_alloc_ul(idr, ptr, &tmp, max, gfp);
*nextid = tmp;
return ret;
 }
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 23a9c89c7ad9..fc55ff31eca7 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -356,24 +356,9 @@ int radix_tree_split(struct radix_tree_root *, unsigned 
long index,
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
unsigned new_order, void *);
 
-void __rcu **idr_get_free_cmn(struct radix_tree_root *root,
+void __rcu **idr_get_free(struct radix_tree_root *root,
  struct radix_tree_iter *iter, gfp_t gfp,
  unsigned long max);
-static inline void __rcu **idr_get_free(struct radix_tree_root *root,
-   struct radix_tree_iter *iter,
-   gfp_t gfp,
-   int end)
-{
-   return idr_get_free_cmn(root, iter, gfp, end > 0 ? end - 1 : INT_MAX);
-}
-
-static inline void __rc
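
To illustrate the calling convention above: @nextid is read and updated
in place, and @max is inclusive, so ULONG_MAX itself is allocatable.  A
minimal sketch, with identifiers assumed for the example rather than
taken from the patch:

	unsigned long id = 1;	/* lowest acceptable ID */
	int err = idr_alloc_ul(&my_idr, ptr, &id, ULONG_MAX, GFP_KERNEL);

	if (err)		/* __must_check: the errno must be examined */
		return err;
	/* on success, @id holds the newly allocated ID */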

[PATCH v2 14/17] cls_u32: Convert to idr_alloc_u32

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

No real benefit to this classifier, but since we're allocating a u32
anyway, we should use this function.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_u32.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index e65b47483dc0..e433d1adccc8 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -970,8 +970,8 @@ static int u32_change(struct net *net, struct sk_buff 
*in_skb,
return -ENOMEM;
}
} else {
-   err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL,
-   handle, handle + 1, GFP_KERNEL);
+   err = idr_alloc_u32(&tp_c->handle_idr, ht, &handle,
+   handle, GFP_KERNEL);
if (err) {
kfree(ht);
return err;
@@ -1020,8 +1020,7 @@ static int u32_change(struct net *net, struct sk_buff 
*in_skb,
if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
return -EINVAL;
handle = htid | TC_U32_NODE(handle);
-   err = idr_alloc_ext(&ht->handle_idr, NULL, NULL,
-   handle, handle + 1,
+   err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle,
GFP_KERNEL);
if (err)
return err;
-- 
2.15.0



[PATCH v2 05/17] idr: Delete idr_remove_ext function

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Simply changing idr_remove's 'id' argument to 'unsigned long' suffices
for all callers.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h| 7 +--
 net/sched/act_api.c| 2 +-
 net/sched/cls_basic.c  | 6 +++---
 net/sched/cls_bpf.c| 4 ++--
 net/sched/cls_flower.c | 4 ++--
 net/sched/cls_u32.c| 8 
 6 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index dd048cf456b7..9a4042489ec6 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -140,16 +140,11 @@ void *idr_replace(struct idr *, void *, int id);
 void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
 void idr_destroy(struct idr *);
 
-static inline void *idr_remove_ext(struct idr *idr, unsigned long id)
+static inline void *idr_remove(struct idr *idr, unsigned long id)
 {
	return radix_tree_delete_item(&idr->idr_rt, id, NULL);
 }
 
-static inline void *idr_remove(struct idr *idr, int id)
-{
-   return idr_remove_ext(idr, id);
-}
-
 static inline void idr_init(struct idr *idr)
 {
	INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4d33a50a8a6d..bab81574a420 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -78,7 +78,7 @@ static void free_tcf(struct tc_action *p)
 static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
 {
	spin_lock_bh(&idrinfo->lock);
-   idr_remove_ext(&idrinfo->action_idr, p->tcfa_index);
+   idr_remove(&idrinfo->action_idr, p->tcfa_index);
	spin_unlock_bh(&idrinfo->lock);
	gen_kill_estimator(&p->tcfa_rate_est);
free_tcf(p);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 5f169ded347e..d2193304bad0 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -120,7 +120,7 @@ static void basic_destroy(struct tcf_proto *tp)
	list_for_each_entry_safe(f, n, &head->flist, link) {
		list_del_rcu(&f->link);
		tcf_unbind_filter(tp, &f->res);
-   idr_remove_ext(&head->handle_idr, f->handle);
+   idr_remove(&head->handle_idr, f->handle);
		if (tcf_exts_get_net(&f->exts))
			call_rcu(&f->rcu, basic_delete_filter);
		else
@@ -137,7 +137,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, 
bool *last)
 
	list_del_rcu(&f->link);
	tcf_unbind_filter(tp, &f->res);
-   idr_remove_ext(&head->handle_idr, f->handle);
+   idr_remove(&head->handle_idr, f->handle);
	tcf_exts_get_net(&f->exts);
	call_rcu(&f->rcu, basic_delete_filter);
	*last = list_empty(&head->flist);
@@ -224,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
if (err < 0) {
if (!fold)
-   idr_remove_ext(&head->handle_idr, fnew->handle);
+   idr_remove(&head->handle_idr, fnew->handle);
goto errout;
}
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 6fe798c2df1a..b017d99fd7e1 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -299,7 +299,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct 
cls_bpf_prog *prog)
 {
struct cls_bpf_head *head = rtnl_dereference(tp->root);
 
-   idr_remove_ext(&head->handle_idr, prog->handle);
+   idr_remove(&head->handle_idr, prog->handle);
	cls_bpf_stop_offload(tp, prog);
	list_del_rcu(&prog->link);
	tcf_unbind_filter(tp, &prog->res);
@@ -542,7 +542,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
cls_bpf_free_parms(prog);
 errout_idr:
if (!oldprog)
-   idr_remove_ext(&head->handle_idr, prog->handle);
+   idr_remove(&head->handle_idr, prog->handle);
 errout:
	tcf_exts_destroy(&prog->exts);
kfree(prog);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 543a3e875d05..3e89b0be1706 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -283,7 +283,7 @@ static void __fl_delete(struct tcf_proto *tp, struct 
cls_fl_filter *f)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
 
-   idr_remove_ext(&head->handle_idr, f->handle);
+   idr_remove(&head->handle_idr, f->handle);
	list_del_rcu(&f->list);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f);
@@ -972,7 +972,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
 
 errout_idr:
if (fnew->handle)
-   idr_remove_ext(&head->handle_idr, fnew->handle);
+   idr_remove(&head->handle_idr, fnew->handle);
 errout:
	tcf_exts_destroy(&fnew->exts);
kfree(fnew);
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
i

[PATCH v2 04/17] IDR test suite: Check handling negative end correctly

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

One of the charming quirks of the idr_alloc() interface is that you
can pass a negative end and it will be interpreted as "maximum".  Ensure
we don't break that.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 tools/testing/radix-tree/idr-test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 193450b29bf0..892ef8855b02 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -207,6 +207,7 @@ void idr_checks(void)
		assert(idr_alloc(&idr, item, i, i + 10, GFP_KERNEL) == i);
	}
	assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i, GFP_KERNEL) == -ENOSPC);
+	assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i + 10, GFP_KERNEL) == -ENOSPC);
 
	idr_for_each(&idr, item_idr_free, &idr);
	idr_destroy(&idr);
-- 
2.15.0



[PATCH v2 00/17] IDR changes for v4.15-rc1

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

v2:
 - Rebased on f6454f80e8a965fca203dab28723f68ec78db608 to resolve
   conflicting changes with cls_bpf
 - Fix whitespace
 - Change a WARN_ON to WARN_ON_ONCE (git snafu yesterday)

Original cover letter:

The patches here are of three types:

 - Enhancing the test suite (fixing the build, adding a couple of new
   tests, fixing a bug in the test)
 - Replacing the 'extended' IDR API
 - Fixing some low-probability bugs 

As far as the 'extended' IDR API goes, this was added by Chris Mi to
permit a savvy user to use IDs up to ULONG_MAX in size (the traditional
IDR API only permits IDs up to INT_MAX).  It's harder to use, so we
wouldn't want to convert all users over to it.  But it can be made
easier to use than it currently is, which is what I've done here.  The
principal way that I've made it easier to use is by introducing
idr_alloc_u32(), which is what all but one of the existing users
actually want.
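
As a rough sketch of the call sites this enables (hypothetical 'head'
and 'fnew', in the style of the classifier conversions later in the
series):

	u32 handle = 1;
	int err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
				INT_MAX, GFP_KERNEL);
	if (err)
		return err;	/* -ENOMEM or -ENOSPC */
	fnew->handle = handle;	/* @handle was updated in place */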

The last patch at the end I thought of just now -- what happens when
somebody adds an IDR entry with an ID > INT_MAX and then tries to
iterate over all entries in the IDR using an old interface that can't
return these large IDs?  It's not safe to return those IDs, so I've
settled for a dmesg warning and terminating the iteration.

Matthew Wilcox (17):
  idr: Fix build
  radix tree test suite: Remove ARRAY_SIZE
  idr test suite: Fix ida_test_random()
  IDR test suite: Check handling negative end correctly
  idr: Delete idr_remove_ext function
  idr: Delete idr_replace_ext function
  idr: Delete idr_find_ext function
  idr: Add idr_alloc_u32 helper
  net sched actions: Convert to use idr_alloc_u32
  cls_basic: Convert to use idr_alloc_u32
  cls_bpf: Convert to use idr_alloc_u32
  cls_flower: Convert to idr_alloc_u32
  cls_u32: Reinstate cyclic allocation
  cls_u32: Convert to idr_alloc_u32
  idr: Rename idr_alloc_ext to idr_alloc_ul
  idr: Rename idr_for_each_entry_ext
  idr: Warn if old iterators see large IDs

 include/linux/idr.h | 109 ++--
 include/linux/radix-tree.h  |  17 +---
 lib/idr.c   | 143 +++-
 lib/radix-tree.c|   3 +-
 net/sched/act_api.c |  72 +++-
 net/sched/cls_basic.c   |  33 
 net/sched/cls_bpf.c |  30 +++
 net/sched/cls_flower.c  |  34 
 net/sched/cls_u32.c |  51 +---
 tools/testing/radix-tree/idr-test.c |  22 -
 tools/testing/radix-tree/linux/kernel.h |   2 -
 11 files changed, 266 insertions(+), 250 deletions(-)

-- 
2.15.0



[PATCH v2 02/17] radix tree test suite: Remove ARRAY_SIZE

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

This is now defined in tools/include/linux/kernel.h, so our
definition generates a warning.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 tools/testing/radix-tree/linux/kernel.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/radix-tree/linux/kernel.h 
b/tools/testing/radix-tree/linux/kernel.h
index c3bc3f364f68..426f32f28547 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -17,6 +17,4 @@
 #define pr_debug printk
 #define pr_cont printk
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 #endif /* _KERNEL_H */
-- 
2.15.0



[PATCH v2 03/17] idr test suite: Fix ida_test_random()

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The test was checking the wrong errno; ida_get_new_above() returns
-EAGAIN, not -ENOMEM, on memory allocation failure.  Double the number of
threads to increase the chance that we actually exercise this path
during the test suite (it was a bit sporadic before).

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 tools/testing/radix-tree/idr-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 30cd0b296f1a..193450b29bf0 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -380,7 +380,7 @@ void ida_check_random(void)
do {
		ida_pre_get(&ida, GFP_KERNEL);
		err = ida_get_new_above(&ida, bit, &id);
-   } while (err == -ENOMEM);
+   } while (err == -EAGAIN);
assert(!err);
assert(id == bit);
}
@@ -489,7 +489,7 @@ static void *ida_random_fn(void *arg)
 
 void ida_thread_tests(void)
 {
-   pthread_t threads[10];
+   pthread_t threads[20];
int i;
 
for (i = 0; i < ARRAY_SIZE(threads); i++)
-- 
2.15.0
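
For reference, the allocation idiom this test exercises looks roughly
like the following (a sketch; surrounding declarations assumed):

	do {
		if (!ida_pre_get(&ida, GFP_KERNEL))
			break;			/* truly out of memory */
		err = ida_get_new_above(&ida, bit, &id);
	} while (err == -EAGAIN);		/* -EAGAIN: preload and retry */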



[PATCH v2 09/17] net sched actions: Convert to use idr_alloc_u32

2017-11-29 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper.  Also untangle the error path, and in so doing
noticed that estimator generator failure would lead to us leaking an
ID.  Fix that bug.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/act_api.c | 60 ++---
 1 file changed, 25 insertions(+), 35 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index efb90b8a3bf0..156302c110af 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -274,7 +274,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
struct tcf_idrinfo *idrinfo = tn->idrinfo;
	struct idr *idr = &idrinfo->action_idr;
int err = -ENOMEM;
-   unsigned long idr_index;
 
if (unlikely(!p))
return -ENOMEM;
@@ -284,45 +283,28 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct 
gnet_stats_basic_cpu);
-   if (!p->cpu_bstats) {
-err1:
-   kfree(p);
-   return err;
-   }
-   p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
-   if (!p->cpu_qstats) {
-err2:
-   free_percpu(p->cpu_bstats);
+   if (!p->cpu_bstats)
goto err1;
-   }
+   p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+   if (!p->cpu_qstats)
+   goto err2;
}
	spin_lock_init(&p->tcfa_lock);
+   idr_preload(GFP_KERNEL);
+   spin_lock_bh(&idrinfo->lock);
	/* user doesn't specify an index */
	if (!index) {
-   idr_preload(GFP_KERNEL);
-   spin_lock_bh(&idrinfo->lock);
-   err = idr_alloc_ext(idr, NULL, &idr_index, 1, 0,
-   GFP_ATOMIC);
-   spin_unlock_bh(&idrinfo->lock);
-   idr_preload_end();
-   if (err) {
-err3:
-   free_percpu(p->cpu_qstats);
-   goto err2;
-   }
-   p->tcfa_index = idr_index;
+   index = 1;
+   err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC);
	} else {
-   idr_preload(GFP_KERNEL);
-   spin_lock_bh(&idrinfo->lock);
-   err = idr_alloc_ext(idr, NULL, NULL, index, index + 1,
-   GFP_ATOMIC);
-   spin_unlock_bh(&idrinfo->lock);
-   idr_preload_end();
-   if (err)
-   goto err3;
-   p->tcfa_index = index;
+   err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
	}
+   spin_unlock_bh(&idrinfo->lock);
+   idr_preload_end();
+   if (err)
+   goto err3;
 
+   p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
p->tcfa_tm.firstuse = 0;
@@ -330,9 +312,8 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
	err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
				&p->tcfa_rate_est,
				&p->tcfa_lock, NULL, est);
-   if (err) {
-   goto err3;
-   }
+   if (err)
+   goto err4;
}
 
p->idrinfo = idrinfo;
@@ -340,6 +321,15 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
	INIT_LIST_HEAD(&p->list);
*a = p;
return 0;
+err4:
+   idr_remove(idr, index);
+err3:
+   free_percpu(p->cpu_qstats);
+err2:
+   free_percpu(p->cpu_bstats);
+err1:
+   kfree(p);
+   return err;
 }
 EXPORT_SYMBOL(tcf_idr_create);
 
-- 
2.15.0



Re: [PATCH 11/17] cls_bpf: Convert to use idr_alloc_u32

2017-11-29 Thread Matthew Wilcox
On Tue, Nov 28, 2017 at 05:08:40PM -0800, Jakub Kicinski wrote:
> On Tue, 28 Nov 2017 13:33:06 -0800, Matthew Wilcox wrote:
> > +   ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
> > +   INT_MAX, GFP_KERNEL);
> > +   } else if (!oldprog) {
> > +   ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
> > +   handle, GFP_KERNEL);
> 
> nit: in many places you seem to not align the second line with opening
> parenthesis.  Is that intentional?

It's more that I don't care.  I press 'enter', which indents the arguments
by a certain amount, then press the 'tab' key until it looks aesthetically
pleasing.

> FWIW there may be a small merge conflict with net on cls_bpf in patch
> 5, some of the code has been removed.

Thanks.  Dave, do you want to take the IDR patches through your tree to
save conflict resolution?


[PATCH 16/17] idr: Rename idr_for_each_entry_ext

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Match idr_alloc_ul with idr_get_next_ul and idr_for_each_entry_ul.
Also add kernel-doc.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 17 ++---
 lib/idr.c   | 20 +++-
 net/sched/act_api.c |  6 +++---
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 344380fd0887..91d27a9bcdf4 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -88,7 +88,7 @@ int idr_alloc_cyclic(struct idr *, void *entry, int start, 
int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
-void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
+void *idr_get_next_ul(struct idr *, unsigned long *nextid);
 void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
@@ -178,8 +178,19 @@ static inline void *idr_find(const struct idr *idr, 
unsigned long id)
  */
 #define idr_for_each_entry(idr, entry, id) \
for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id)
-#define idr_for_each_entry_ext(idr, entry, id) \
-   for (id = 0; ((entry) = idr_get_next_ext(idr, &(id))) != NULL; ++id)
+
+/**
+ * idr_for_each_entry_ul() - iterate over an IDR's elements of a given type.
+ * @idr: IDR handle.
+ * @entry: The type * to use as cursor.
+ * @id: Entry ID.
+ *
+ * @entry and @id do not need to be initialized before the loop, and
+ * after normal termination @entry is left with the value NULL.  This
+ * is convenient for a "not found" value.
+ */
+#define idr_for_each_entry_ul(idr, entry, id)  \
+   for (id = 0; ((entry) = idr_get_next_ul(idr, &(id))) != NULL; ++id)
 
 /**
  * idr_for_each_entry_continue - continue iteration over an idr's elements of 
a given type
diff --git a/lib/idr.c b/lib/idr.c
index 103afb97b4bd..772a24513d1e 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -155,9 +155,9 @@ int idr_for_each(const struct idr *idr,
 EXPORT_SYMBOL(idr_for_each);
 
 /**
- * idr_get_next - Find next populated entry
- * @idr: idr handle
- * @nextid: Pointer to lowest possible ID to return
+ * idr_get_next() - Find next populated entry.
+ * @idr: IDR handle.
+ * @nextid: Pointer to lowest possible ID to return.
  *
  * Returns the next populated entry in the tree with an ID greater than
  * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
@@ -178,7 +178,17 @@ void *idr_get_next(struct idr *idr, int *nextid)
 }
 EXPORT_SYMBOL(idr_get_next);
 
-void *idr_get_next_ext(struct idr *idr, unsigned long *nextid)
+/**
+ * idr_get_next_ul() - Find next populated entry.
+ * @idr: IDR handle.
+ * @nextid: Pointer to lowest possible ID to return.
+ *
+ * Returns the next populated entry in the tree with an ID greater than
+ * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
+ * to the ID of the found value.  To use in a loop, the value pointed to by
+ * nextid must be incremented by the user.
+ */
+void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
 {
struct radix_tree_iter iter;
void __rcu **slot;
@@ -190,7 +200,7 @@ void *idr_get_next_ext(struct idr *idr, unsigned long 
*nextid)
*nextid = iter.index;
return rcu_dereference_raw(*slot);
 }
-EXPORT_SYMBOL(idr_get_next_ext);
+EXPORT_SYMBOL(idr_get_next_ul);
 
 /**
  * idr_replace - replace pointer for given id
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 156302c110af..4133d91b7029 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -124,7 +124,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
 
s_i = cb->args[0];
 
-   idr_for_each_entry_ext(idr, p, id) {
+   idr_for_each_entry_ul(idr, p, id) {
index++;
if (index < s_i)
continue;
@@ -181,7 +181,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
 
-   idr_for_each_entry_ext(idr, p, id) {
+   idr_for_each_entry_ul(idr, p, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
@@ -351,7 +351,7 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
int ret;
unsigned long id = 1;
 
-   idr_for_each_entry_ext(idr, p, id) {
+   idr_for_each_entry_ul(idr, p, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
-- 
2.15.0
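
A minimal sketch of the renamed iterator in use, in the spirit of the
act_api walkers in this patch (variables assumed for the example):

	struct tc_action *p;
	unsigned long id;

	idr_for_each_entry_ul(idr, p, id) {
		/* @p is each stored entry; @id may exceed INT_MAX */
		pr_info("action at index %lu\n", id);
	}
	/* on normal termination p is NULL -- a handy "not found" value */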



[PATCH 01/17] idr: Fix build

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The IDR calls WARN_ON without including <linux/bug.h>.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7c3a365f7e12..dd048cf456b7 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -13,6 +13,7 @@
 #define __IDR_H__
 
 #include <linux/radix-tree.h>
+#include <linux/bug.h>
 #include <linux/gfp.h>
 #include <linux/percpu.h>
 
-- 
2.15.0



[PATCH 17/17] idr: Warn if old iterators see large IDs

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Now that the IDR can be used to store large IDs, it is possible somebody
might only partially convert their old code and use the iterators which
can only handle IDs up to INT_MAX.  It's probably unwise to show them a
truncated ID, so settle for spewing warnings to dmesg, and terminating
the iteration.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 lib/idr.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/idr.c b/lib/idr.c
index 772a24513d1e..f56133a600bb 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -145,7 +145,11 @@ int idr_for_each(const struct idr *idr,
void __rcu **slot;
 
	radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
-   int ret = fn(iter.index, rcu_dereference_raw(*slot), data);
+   int ret;
+
+   if (WARN_ON(iter.index > INT_MAX))
+   break;
+   ret = fn(iter.index, rcu_dereference_raw(*slot), data);
if (ret)
return ret;
}
@@ -173,6 +177,9 @@ void *idr_get_next(struct idr *idr, int *nextid)
if (!slot)
return NULL;
 
+   if (WARN_ON_ONCE(iter.index > INT_MAX))
+   return NULL;
+
*nextid = iter.index;
return rcu_dereference_raw(*slot);
 }
-- 
2.15.0
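
The failure mode being guarded against, sketched in the test-suite
style (hypothetical; 'idr' and 'ptr' assumed):

	unsigned long id = (unsigned long)INT_MAX + 1;
	int small = 0;

	/* store an entry at an ID the old int-based API cannot express */
	assert(idr_alloc_ul(&idr, ptr, &id, ULONG_MAX, GFP_KERNEL) == 0);

	/* the old iterator now warns and stops instead of truncating */
	assert(idr_get_next(&idr, &small) == NULL);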



[PATCH 03/17] idr test suite: Fix ida_test_random()

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The test was checking the wrong errno; ida_get_new_above() returns
-EAGAIN, not -ENOMEM, on memory allocation failure.  Double the number of
threads to increase the chance that we actually exercise this path
during the test suite (it was a bit sporadic before).

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 tools/testing/radix-tree/idr-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 30cd0b296f1a..193450b29bf0 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -380,7 +380,7 @@ void ida_check_random(void)
do {
		ida_pre_get(&ida, GFP_KERNEL);
		err = ida_get_new_above(&ida, bit, &id);
-   } while (err == -ENOMEM);
+   } while (err == -EAGAIN);
assert(!err);
assert(id == bit);
}
@@ -489,7 +489,7 @@ static void *ida_random_fn(void *arg)
 
 void ida_thread_tests(void)
 {
-   pthread_t threads[10];
+   pthread_t threads[20];
int i;
 
for (i = 0; i < ARRAY_SIZE(threads); i++)
-- 
2.15.0



[PATCH 04/17] IDR test suite: Check handling negative end correctly

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

One of the charming quirks of the idr_alloc() interface is that you
can pass a negative end and it will be interpreted as "maximum".  Ensure
we don't break that.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 tools/testing/radix-tree/idr-test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 193450b29bf0..892ef8855b02 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -207,6 +207,7 @@ void idr_checks(void)
		assert(idr_alloc(&idr, item, i, i + 10, GFP_KERNEL) == i);
	}
	assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i, GFP_KERNEL) == -ENOSPC);
+	assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i + 10, GFP_KERNEL) == -ENOSPC);
 
	idr_for_each(&idr, item_idr_free, &idr);
	idr_destroy(&idr);
-- 
2.15.0



[PATCH 05/17] idr: Delete idr_remove_ext function

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Simply changing idr_remove's 'id' argument to 'unsigned long' suffices
for all callers.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h| 7 +--
 net/sched/act_api.c| 2 +-
 net/sched/cls_basic.c  | 6 +++---
 net/sched/cls_bpf.c| 6 +++---
 net/sched/cls_flower.c | 4 ++--
 net/sched/cls_u32.c| 8 
 6 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index dd048cf456b7..9a4042489ec6 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -140,16 +140,11 @@ void *idr_replace(struct idr *, void *, int id);
 void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
 void idr_destroy(struct idr *);
 
-static inline void *idr_remove_ext(struct idr *idr, unsigned long id)
+static inline void *idr_remove(struct idr *idr, unsigned long id)
 {
	return radix_tree_delete_item(&idr->idr_rt, id, NULL);
 }
 
-static inline void *idr_remove(struct idr *idr, int id)
-{
-   return idr_remove_ext(idr, id);
-}
-
 static inline void idr_init(struct idr *idr)
 {
	INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4d33a50a8a6d..bab81574a420 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -78,7 +78,7 @@ static void free_tcf(struct tc_action *p)
 static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
 {
	spin_lock_bh(&idrinfo->lock);
-   idr_remove_ext(&idrinfo->action_idr, p->tcfa_index);
+   idr_remove(&idrinfo->action_idr, p->tcfa_index);
	spin_unlock_bh(&idrinfo->lock);
	gen_kill_estimator(&p->tcfa_rate_est);
free_tcf(p);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 5f169ded347e..d2193304bad0 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -120,7 +120,7 @@ static void basic_destroy(struct tcf_proto *tp)
	list_for_each_entry_safe(f, n, &head->flist, link) {
		list_del_rcu(&f->link);
		tcf_unbind_filter(tp, &f->res);
-   idr_remove_ext(&head->handle_idr, f->handle);
+   idr_remove(&head->handle_idr, f->handle);
		if (tcf_exts_get_net(&f->exts))
			call_rcu(&f->rcu, basic_delete_filter);
else
@@ -137,7 +137,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, 
bool *last)
 
	list_del_rcu(&f->link);
	tcf_unbind_filter(tp, &f->res);
-   idr_remove_ext(&head->handle_idr, f->handle);
+   idr_remove(&head->handle_idr, f->handle);
	tcf_exts_get_net(&f->exts);
	call_rcu(&f->rcu, basic_delete_filter);
	*last = list_empty(&head->flist);
@@ -224,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
if (err < 0) {
if (!fold)
-   idr_remove_ext(&head->handle_idr, fnew->handle);
+   idr_remove(&head->handle_idr, fnew->handle);
goto errout;
}
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index a9f3e317055c..810c824eea81 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -294,7 +294,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct 
cls_bpf_prog *prog)
 {
struct cls_bpf_head *head = rtnl_dereference(tp->root);
 
-   idr_remove_ext(&head->handle_idr, prog->handle);
+   idr_remove(&head->handle_idr, prog->handle);
	cls_bpf_stop_offload(tp, prog);
	list_del_rcu(&prog->link);
	tcf_unbind_filter(tp, &prog->res);
@@ -516,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
ret = cls_bpf_offload(tp, prog, oldprog);
if (ret) {
if (!oldprog)
-   idr_remove_ext(&head->handle_idr, prog->handle);
+   idr_remove(&head->handle_idr, prog->handle);
__cls_bpf_delete_prog(prog);
return ret;
}
@@ -539,7 +539,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
 
 errout_idr:
if (!oldprog)
-   idr_remove_ext(&head->handle_idr, prog->handle);
+   idr_remove(&head->handle_idr, prog->handle);
 errout:
	tcf_exts_destroy(&prog->exts);
kfree(prog);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 543a3e875d05..3e89b0be1706 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -283,7 +283,7 @@ static void __fl_delete(struct tcf_proto *tp, struct 
cls_fl_filter *f)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
 
-   idr_remove_ext(&head->handle_idr, f->handle);
+   idr_remove(&head->handle_idr, f->handle);
	list_del_rcu(&f->list);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f);
@@ -972,7 +972,7 @@ static int f

[PATCH 14/17] cls_u32: Convert to idr_alloc_u32

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

No real benefit to this classifier, but since we're allocating a u32
anyway, we should use this function.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_u32.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index e65b47483dc0..2fede6a55d04 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -970,8 +970,8 @@ static int u32_change(struct net *net, struct sk_buff 
*in_skb,
return -ENOMEM;
}
} else {
-   err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL,
-   handle, handle + 1, GFP_KERNEL);
+   err = idr_alloc_u32(&tp_c->handle_idr, ht, &handle,
+   handle, GFP_KERNEL);
if (err) {
kfree(ht);
return err;
@@ -1020,8 +1020,7 @@ static int u32_change(struct net *net, struct sk_buff 
*in_skb,
if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
return -EINVAL;
handle = htid | TC_U32_NODE(handle);
-   err = idr_alloc_ext(&ht->handle_idr, NULL, NULL,
-   handle, handle + 1,
+   err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle,
GFP_KERNEL);
if (err)
return err;
-- 
2.15.0



[PATCH 13/17] cls_u32: Reinstate cyclic allocation

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Commit e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles")
converted htid allocation to use the IDR.  The ID allocated by this
scheme changes; it used to be cyclic, but now always allocates the
lowest available.  The IDR supports cyclic allocation, so just use
the right function.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_u32.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 9d48674a70e0..e65b47483dc0 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -316,19 +316,13 @@ static void *u32_get(struct tcf_proto *tp, u32 handle)
return u32_lookup_key(ht, handle);
 }
 
+/* Protected by rtnl lock */
 static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
 {
-   unsigned long idr_index;
-   int err;
-
-   /* This is only used inside rtnl lock it is safe to increment
-* without read _copy_ update semantics
-*/
-   err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index,
-   1, 0x7FF, GFP_KERNEL);
-   if (err)
+   int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL);
+   if (id < 0)
return 0;
-   return (u32)(idr_index | 0x800) << 20;
+   return (id | 0x800U) << 20;
 }
 
 static struct hlist_head *tc_u_common_hash;
-- 
2.15.0
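
The behavioural difference, sketched with illustrative IDs over the
same 1..0x7FF range (not code from the patch):

	int a = idr_alloc_cyclic(&idr, p1, 1, 0x7FF, GFP_KERNEL);	/* 1 */
	int b = idr_alloc_cyclic(&idr, p2, 1, 0x7FF, GFP_KERNEL);	/* 2 */

	idr_remove(&idr, a);
	/* cyclic allocation continues after the last ID handed out, so
	 * the freed ID 1 is not immediately reused: */
	int c = idr_alloc_cyclic(&idr, p3, 1, 0x7FF, GFP_KERNEL);	/* 3 */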



[PATCH 08/17] idr: Add idr_alloc_u32 helper

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

All current users of idr_alloc_ext() actually want to allocate a u32 and
it's a little painful for them to use idr_alloc_ext().  This convenience
function makes it simple.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 29 +
 1 file changed, 29 insertions(+)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 12514ec0cd28..9b2fd6f408b2 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -139,6 +139,35 @@ void *idr_get_next_ext(struct idr *idr, unsigned long 
*nextid);
 void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
+/**
+ * idr_alloc_u32() - Allocate an ID.
+ * @idr: IDR handle.
+ * @ptr: Pointer to be associated with the new ID.
+ * @nextid: The new ID.
+ * @max: The maximum ID to allocate (inclusive).
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates an unused ID in the range [*nextid, max] and updates the @nextid
+ * pointer with the newly assigned ID.  Returns -ENOSPC and does not modify
+ * @nextid if there are no unused IDs in the range.
+ *
+ * The caller should provide their own locking to ensure that two concurrent
+ * modifications to the IDR are not possible.  Read-only accesses to the
+ * IDR may be done under the RCU read lock or may exclude simultaneous
+ * writers.
+ *
+ * Return: 0 on success, -ENOMEM for memory allocation errors, -ENOSPC if
+ * there are no free IDs in the range.
+ */
+static inline int __must_check idr_alloc_u32(struct idr *idr, void *ptr,
+   u32 *nextid, unsigned long max, gfp_t gfp)
+{
+   unsigned long tmp = *nextid;
+   int ret = idr_alloc_ext(idr, ptr, &tmp, tmp, max + 1, gfp);
+   *nextid = tmp;
+   return ret;
+}
+
 static inline void *idr_remove(struct idr *idr, unsigned long id)
 {
	return radix_tree_delete_item(&idr->idr_rt, id, NULL);
-- 
2.15.0



[PATCH 07/17] idr: Delete idr_find_ext function

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Simply changing idr_find's 'id' argument to 'unsigned long' works
for all callers.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h| 7 +--
 net/sched/act_api.c| 2 +-
 net/sched/cls_flower.c | 2 +-
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 90dbe7a3735c..12514ec0cd28 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -179,16 +179,11 @@ static inline void idr_preload_end(void)
  * This function can be called under rcu_read_lock(), given that the leaf
  * pointers lifetimes are correctly managed.
  */
-static inline void *idr_find_ext(const struct idr *idr, unsigned long id)
+static inline void *idr_find(const struct idr *idr, unsigned long id)
 {
	return radix_tree_lookup(&idr->idr_rt, id);
 }
 
-static inline void *idr_find(const struct idr *idr, int id)
-{
-   return idr_find_ext(idr, id);
-}
-
 /**
  * idr_for_each_entry - iterate over an idr's elements of a given type
  * @idr: idr handle
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 7e901e855d68..efb90b8a3bf0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -222,7 +222,7 @@ static struct tc_action *tcf_idr_lookup(u32 index, struct 
tcf_idrinfo *idrinfo)
struct tc_action *p = NULL;
 
	spin_lock_bh(&idrinfo->lock);
-   p = idr_find_ext(&idrinfo->action_idr, index);
+   p = idr_find(&idrinfo->action_idr, index);
	spin_unlock_bh(&idrinfo->lock);
 
return p;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ca71823bee03..ec0dc92f6104 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -329,7 +329,7 @@ static void *fl_get(struct tcf_proto *tp, u32 handle)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
 
-   return idr_find_ext(&head->handle_idr, handle);
+   return idr_find(&head->handle_idr, handle);
 }
 
 static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
-- 
2.15.0



[PATCH 00/17] IDR patches for 4.15

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

The patches here are of three types:

 - Enhancing the test suite (fixing the build, adding a couple of new
   tests, fixing a bug in the test)
 - Replacing the 'extended' IDR API
 - Fixing some low-probability bugs 

As far as the 'extended' IDR API goes, this was added by Chris Mi to
permit a savvy user to use IDs up to ULONG_MAX in size (the traditional
IDR API only permits IDs up to INT_MAX).  It's harder to use, so we
wouldn't want to convert all users over to it.  But it can be made
easier to use than it currently is, which is what I've done here.  The
principal way that I've made it easier to use is by introducing
idr_alloc_u32(), which is what all but one of the existing users
actually want.

The last patch at the end I thought of just now -- what happens when
somebody adds an IDR entry with an ID > INT_MAX and then tries to
iterate over all entries in the IDR using an old interface that can't
return these large IDs?  It's not safe to return those IDs, so I've
settled for a dmesg warning and terminating the iteration.

Most of these patches have been sitting in my xarray tree in one form or
another for over a month.  I haven't seen anything from 0day to indicate
a problem here, but then there are as yet very few users and I'm not
sure 0day has covered any of them.

Matthew Wilcox (17):
  idr: Fix build
  radix tree test suite: Remove ARRAY_SIZE
  idr test suite: Fix ida_test_random()
  IDR test suite: Check handling negative end correctly
  idr: Delete idr_remove_ext function
  idr: Delete idr_replace_ext function
  idr: Delete idr_find_ext function
  idr: Add idr_alloc_u32 helper
  net sched actions: Convert to use idr_alloc_u32
  cls_basic: Convert to use idr_alloc_u32
  cls_bpf: Convert to use idr_alloc_u32
  cls_flower: Convert to idr_alloc_u32
  cls_u32: Reinstate cyclic allocation
  cls_u32: Convert to idr_alloc_u32
  idr: Rename idr_alloc_ext to idr_alloc_ul
  idr: Rename idr_for_each_entry_ext
  idr: Warn if old iterators see large IDs

 include/linux/idr.h | 109 ++--
 include/linux/radix-tree.h  |  17 +---
 lib/idr.c   | 143 +++-
 lib/radix-tree.c|   3 +-
 net/sched/act_api.c |  72 +++-
 net/sched/cls_basic.c   |  33 
 net/sched/cls_bpf.c |  32 ---
 net/sched/cls_flower.c  |  34 
 net/sched/cls_u32.c |  51 +---
 tools/testing/radix-tree/idr-test.c |  22 -
 tools/testing/radix-tree/linux/kernel.h |   2 -
 11 files changed, 267 insertions(+), 251 deletions(-)

-- 
2.15.0



[PATCH 09/17] net sched actions: Convert to use idr_alloc_u32

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper.  Also untangle the error path, and in so doing
noticed that estimator generator failure would lead to us leaking an
ID.  Fix that bug.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/act_api.c | 60 ++---
 1 file changed, 25 insertions(+), 35 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index efb90b8a3bf0..156302c110af 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -274,7 +274,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
struct tcf_idrinfo *idrinfo = tn->idrinfo;
	struct idr *idr = &idrinfo->action_idr;
int err = -ENOMEM;
-   unsigned long idr_index;
 
if (unlikely(!p))
return -ENOMEM;
@@ -284,45 +283,28 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct 
gnet_stats_basic_cpu);
-   if (!p->cpu_bstats) {
-err1:
-   kfree(p);
-   return err;
-   }
-   p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
-   if (!p->cpu_qstats) {
-err2:
-   free_percpu(p->cpu_bstats);
+   if (!p->cpu_bstats)
goto err1;
-   }
+   p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+   if (!p->cpu_qstats)
+   goto err2;
}
	spin_lock_init(&p->tcfa_lock);
+   idr_preload(GFP_KERNEL);
+   spin_lock_bh(&idrinfo->lock);
	/* user doesn't specify an index */
	if (!index) {
-   idr_preload(GFP_KERNEL);
-   spin_lock_bh(&idrinfo->lock);
-   err = idr_alloc_ext(idr, NULL, &idr_index, 1, 0,
-   GFP_ATOMIC);
-   spin_unlock_bh(&idrinfo->lock);
-   idr_preload_end();
-   if (err) {
-err3:
-   free_percpu(p->cpu_qstats);
-   goto err2;
-   }
-   p->tcfa_index = idr_index;
+   index = 1;
+   err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC);
	} else {
-   idr_preload(GFP_KERNEL);
-   spin_lock_bh(&idrinfo->lock);
-   err = idr_alloc_ext(idr, NULL, NULL, index, index + 1,
-   GFP_ATOMIC);
-   spin_unlock_bh(&idrinfo->lock);
-   idr_preload_end();
-   if (err)
-   goto err3;
-   p->tcfa_index = index;
+   err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
	}
+   spin_unlock_bh(&idrinfo->lock);
+   idr_preload_end();
+   if (err)
+   goto err3;
 
+   p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
p->tcfa_tm.firstuse = 0;
@@ -330,9 +312,8 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
	err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
				&p->tcfa_rate_est,
				&p->tcfa_lock, NULL, est);
-   if (err) {
-   goto err3;
-   }
+   if (err)
+   goto err4;
}
 
p->idrinfo = idrinfo;
@@ -340,6 +321,15 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
	INIT_LIST_HEAD(&p->list);
*a = p;
return 0;
+err4:
+   idr_remove(idr, index);
+err3:
+   free_percpu(p->cpu_qstats);
+err2:
+   free_percpu(p->cpu_bstats);
+err1:
+   kfree(p);
+   return err;
 }
 EXPORT_SYMBOL(tcf_idr_create);
 
-- 
2.15.0



[PATCH 10/17] cls_basic: Convert to use idr_alloc_u32

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper which saves a temporary variable and a few lines of
code.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_basic.c | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 147700afcf31..529597c8c28c 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -182,7 +182,6 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
struct nlattr *tb[TCA_BASIC_MAX + 1];
struct basic_filter *fold = (struct basic_filter *) *arg;
struct basic_filter *fnew;
-   unsigned long idr_index;
 
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
@@ -205,21 +204,17 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
if (err < 0)
goto errout;
 
-   if (handle) {
-   fnew->handle = handle;
-   if (!fold) {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   handle, handle + 1, GFP_KERNEL);
-   if (err)
-   goto errout;
-   }
-   } else {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   1, 0x7FFFFFFF, GFP_KERNEL);
-   if (err)
-   goto errout;
-   fnew->handle = idr_index;
+   if (!handle) {
+   handle = 1;
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   INT_MAX, GFP_KERNEL);
+   } else if (!fold) {
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   handle, GFP_KERNEL);
}
+   if (err)
+   goto errout;
+   fnew->handle = handle;
 
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
if (err < 0) {
-- 
2.15.0



[PATCH 06/17] idr: Delete idr_replace_ext function

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Changing idr_replace's 'id' argument to 'unsigned long' works for all
callers.  Callers which passed a negative ID now get -ENOENT instead of
-EINVAL.  No callers relied on this error value.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h|  3 +--
 lib/idr.c  | 15 +++
 net/sched/act_api.c|  2 +-
 net/sched/cls_basic.c  |  2 +-
 net/sched/cls_bpf.c|  2 +-
 net/sched/cls_flower.c |  2 +-
 net/sched/cls_u32.c|  2 +-
 7 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 9a4042489ec6..90dbe7a3735c 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -136,8 +136,7 @@ int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
 void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
-void *idr_replace(struct idr *, void *, int id);
-void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
+void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
 static inline void *idr_remove(struct idr *idr, unsigned long id)
diff --git a/lib/idr.c b/lib/idr.c
index 2593ce513a18..577bfd4fe5c2 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -147,18 +147,9 @@ EXPORT_SYMBOL(idr_get_next_ext);
  * the one being replaced!).
  *
  * Returns: the old value on success.  %-ENOENT indicates that @id was not
- * found.  %-EINVAL indicates that @id or @ptr were not valid.
+ * found.  %-EINVAL indicates that @ptr was not valid.
  */
-void *idr_replace(struct idr *idr, void *ptr, int id)
-{
-   if (id < 0)
-   return ERR_PTR(-EINVAL);
-
-   return idr_replace_ext(idr, ptr, id);
-}
-EXPORT_SYMBOL(idr_replace);
-
-void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id)
+void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
 {
struct radix_tree_node *node;
void __rcu **slot = NULL;
@@ -175,7 +166,7 @@ void *idr_replace_ext(struct idr *idr, void *ptr, unsigned 
long id)
 
return entry;
 }
-EXPORT_SYMBOL(idr_replace_ext);
+EXPORT_SYMBOL(idr_replace);
 
 /**
  * DOC: IDA description
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index bab81574a420..7e901e855d68 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -348,7 +348,7 @@ void tcf_idr_insert(struct tc_action_net *tn, struct 
tc_action *a)
struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
	spin_lock_bh(&idrinfo->lock);
-   idr_replace_ext(&idrinfo->action_idr, a, a->tcfa_index);
+   idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
	spin_unlock_bh(&idrinfo->lock);
 }
 EXPORT_SYMBOL(tcf_idr_insert);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d2193304bad0..147700afcf31 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -231,7 +231,7 @@ static int basic_change(struct net *net, struct sk_buff 
*in_skb,
*arg = fnew;
 
if (fold) {
-   idr_replace_ext(&head->handle_idr, fnew, fnew->handle);
+   idr_replace(&head->handle_idr, fnew, fnew->handle);
	list_replace_rcu(&fold->link, &fnew->link);
	tcf_unbind_filter(tp, &fold->res);
	tcf_exts_get_net(&fold->exts);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 810c824eea81..b7c5c3150086 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -525,7 +525,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
 
if (oldprog) {
-   idr_replace_ext(&head->handle_idr, prog, handle);
+   idr_replace(&head->handle_idr, prog, handle);
	list_replace_rcu(&oldprog->link, &prog->link);
	tcf_unbind_filter(tp, &oldprog->res);
	tcf_exts_get_net(&oldprog->exts);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 3e89b0be1706..ca71823bee03 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -958,7 +958,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
 
if (fold) {
fnew->handle = handle;
-   idr_replace_ext(&head->handle_idr, fnew, fnew->handle);
+   idr_replace(&head->handle_idr, fnew, fnew->handle);
	list_replace_rcu(&fold->list, &fnew->list);
	tcf_unbind_filter(tp, &fold->res);
	tcf_exts_get_net(&fold->exts);
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 6fe4e3549ad3..9d48674a70e0 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -833,7 +833,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct 
tc_u_common *tp_c,
if (pins->handle == n->handle)
break;
 
-   idr_replace_ext(&ht->handle_idr, n, n->handle);
+   idr_replace(&ht->handle_idr, n, n->handle);
RCU_INIT_POINTER(n->next, pins->next);
rcu_assign_pointer(*ins, n);
 }
-- 
2.15.0
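
Callers check the result the same way as before; a sketch (identifiers
assumed for the example):

	void *old = idr_replace(&idr, new_ptr, id);

	if (IS_ERR(old))
		return PTR_ERR(old);	/* -ENOENT: nothing stored at @id;
					 * -EINVAL: @new_ptr was not valid */
	/* ownership of @old returns to the caller */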



[PATCH 12/17] cls_flower: Convert to idr_alloc_u32

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper which saves a temporary variable and a few lines
of code.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_flower.c | 26 ++
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ec0dc92f6104..77a7a3c883b7 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -858,7 +858,6 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr **tb;
struct fl_flow_mask mask = {};
-   unsigned long idr_index;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -889,21 +888,17 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
goto errout;
 
if (!handle) {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   1, 0x80000000, GFP_KERNEL);
-   if (err)
-   goto errout;
-   fnew->handle = idr_index;
-   }
-
-   /* user specifies a handle and it doesn't exist */
-   if (handle && !fold) {
-   err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
-   handle, handle + 1, GFP_KERNEL);
-   if (err)
-   goto errout;
-   fnew->handle = idr_index;
+   handle = 1;
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   INT_MAX, GFP_KERNEL);
+   } else if (!fold) {
+   /* user specifies a handle and it doesn't exist */
+   err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+   handle, GFP_KERNEL);
}
+   if (err)
+   goto errout;
+   fnew->handle = handle;
 
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
@@ -957,7 +952,6 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
*arg = fnew;
 
if (fold) {
-   fnew->handle = handle;
	idr_replace(&head->handle_idr, fnew, fnew->handle);
	list_replace_rcu(&fold->list, &fnew->list);
	tcf_unbind_filter(tp, &fold->res);
-- 
2.15.0



[PATCH 02/17] radix tree test suite: Remove ARRAY_SIZE

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

This is now defined in tools/include/linux/kernel.h, so our
definition generates a warning.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 tools/testing/radix-tree/linux/kernel.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/radix-tree/linux/kernel.h 
b/tools/testing/radix-tree/linux/kernel.h
index c3bc3f364f68..426f32f28547 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -17,6 +17,4 @@
 #define pr_debug printk
 #define pr_cont printk
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 #endif /* _KERNEL_H */
-- 
2.15.0



[PATCH 15/17] idr: Rename idr_alloc_ext to idr_alloc_ul

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

idr_alloc_ul fits better with other parts of the Linux kernel where we
need to name a function based on the types it operates on.

It uses a 'nextid' pointer argument instead of separate minimum ID and
output assigned ID, (like idr_get_next), reducing the number of arguments
by one.  It also takes a 'max' argument rather than an 'end' argument
(unlike idr_alloc, but the semantics of 'end' don't work for unsigned long
arguments).  And its return value is an errno, so mark it as __must_check.

Includes kernel-doc for idr_alloc_ul, which idr_alloc_ext didn't have,
and I realised we were missing a test-case where idr_alloc_cyclic wraps
around INT_MAX.  Chris Mi <chr...@mellanox.com> has promised to contribute
test-cases for idr_alloc_ul.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h | 55 ++---
 include/linux/radix-tree.h  | 17 +--
 lib/idr.c   | 99 +
 lib/radix-tree.c|  3 +-
 net/sched/cls_u32.c | 20 
 tools/testing/radix-tree/idr-test.c | 17 +++
 6 files changed, 111 insertions(+), 100 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 9b2fd6f408b2..344380fd0887 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -13,7 +13,6 @@
 #define __IDR_H__
 
 #include <linux/radix-tree.h>
-#include <linux/bug.h>
 #include <linux/gfp.h>
 #include <linux/percpu.h>
 
@@ -82,55 +81,9 @@ static inline void idr_set_cursor(struct idr *idr, unsigned 
int val)
 
 void idr_preload(gfp_t gfp_mask);
 
-int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index,
- unsigned long start, unsigned long end, gfp_t gfp,
- bool ext);
-
-/**
- * idr_alloc - allocate an id
- * @idr: idr handle
- * @ptr: pointer to be associated with the new id
- * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive)
- * @gfp: memory allocation flags
- *
- * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
- * if there are no unused IDs in that range.
- *
- * Note that @end is treated as max when <= 0.  This is to always allow
- * using @start + N as @end as long as N is inside integer range.
- *
- * Simultaneous modifications to the @idr are not allowed and should be
- * prevented by the user, usually with a lock.  idr_alloc() may be called
- * concurrently with read-only accesses to the @idr, such as idr_find() and
- * idr_for_each_entry().
- */
-static inline int idr_alloc(struct idr *idr, void *ptr,
-   int start, int end, gfp_t gfp)
-{
-   unsigned long id;
-   int ret;
-
-   if (WARN_ON_ONCE(start < 0))
-   return -EINVAL;
-
-   ret = idr_alloc_cmn(idr, ptr, &id, start, end, gfp, false);
-
-   if (ret)
-   return ret;
-
-   return id;
-}
-
-static inline int idr_alloc_ext(struct idr *idr, void *ptr,
-   unsigned long *index,
-   unsigned long start,
-   unsigned long end,
-   gfp_t gfp)
-{
-   return idr_alloc_cmn(idr, ptr, index, start, end, gfp, true);
-}
-
+int idr_alloc(struct idr *, void *, int start, int end, gfp_t);
+int __must_check idr_alloc_ul(struct idr *, void *, unsigned long *nextid,
+   unsigned long max, gfp_t);
 int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
@@ -163,7 +116,7 @@ static inline int __must_check idr_alloc_u32(struct idr 
*idr, void *ptr,
u32 *nextid, unsigned long max, gfp_t gfp)
 {
unsigned long tmp = *nextid;
-   int ret = idr_alloc_ext(idr, ptr, &tmp, tmp, max + 1, gfp);
+   int ret = idr_alloc_ul(idr, ptr, &tmp, max, gfp);
*nextid = tmp;
return ret;
 }
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 23a9c89c7ad9..fc55ff31eca7 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -356,24 +356,9 @@ int radix_tree_split(struct radix_tree_root *, unsigned 
long index,
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
unsigned new_order, void *);
 
-void __rcu **idr_get_free_cmn(struct radix_tree_root *root,
+void __rcu **idr_get_free(struct radix_tree_root *root,
  struct radix_tree_iter *iter, gfp_t gfp,
  unsigned long max);
-static inline void __rcu **idr_get_free(struct radix_tree_root *root,
-   struct radix_tree_iter *iter,
-   gfp_t gfp,
-   int end)
-{
-   return idr_get_free_cmn(root, iter, gfp, end > 0 ? end - 1 : INT_MAX);
-}
-
-static inline void __rc

[PATCH 11/17] cls_bpf: Convert to use idr_alloc_u32

2017-11-28 Thread Matthew Wilcox
From: Matthew Wilcox <mawil...@microsoft.com>

Use the new helper.  This has a modest reduction in both lines of code
and compiled code size.

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 net/sched/cls_bpf.c | 24 ++--
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index b7c5c3150086..82050b240842 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -467,7 +467,6 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_bpf_prog *oldprog = *arg;
struct nlattr *tb[TCA_BPF_MAX + 1];
struct cls_bpf_prog *prog;
-   unsigned long idr_index;
int ret;
 
if (tca[TCA_OPTIONS] == NULL)
@@ -494,21 +493,18 @@ static int cls_bpf_change(struct net *net, struct sk_buff 
*in_skb,
}
 
if (handle == 0) {
-   ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
-   1, 0x7FFFFFFF, GFP_KERNEL);
-   if (ret)
-   goto errout;
-   prog->handle = idr_index;
-   } else {
-   if (!oldprog) {
-   ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
-   handle, handle + 1, GFP_KERNEL);
-   if (ret)
-   goto errout;
-   }
-   prog->handle = handle;
+   handle = 1;
+   ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
+   INT_MAX, GFP_KERNEL);
+   } else if (!oldprog) {
+   ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
+   handle, GFP_KERNEL);
}
 
+   if (ret)
+   goto errout;
+   prog->handle = handle;
+
ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
if (ret < 0)
goto errout_idr;
-- 
2.15.0



Re: [PATCH net-next 2/2] ncpfs: move net/ncpfs to drivers/staging/ncpfs

2017-11-14 Thread Matthew Wilcox
On Sun, Nov 12, 2017 at 11:22:27AM -0800, Stephen Hemminger wrote:
> The Netware Core Protocol is a file system that talks to
> Netware clients over IPX. Since IPX has been dead for many years
> move the file system into staging for eventual interment.

Should probably cc Petr on this ... (added)


Re: [PATCH 00/12] radix-tree: split out struct radix_tree_root out to

2017-10-10 Thread Matthew Wilcox
On Tue, Oct 10, 2017 at 09:56:22PM +0900, Masahiro Yamada wrote:
> One refactoring alone does not produce much benefits,
> but making continuous efforts will disentangle the knotted threads.
> Of course, this might be a pipe dream...

A lot of people have had that dream, and some of those refactoring
efforts have proven worthwhile.  But it's not a dream without costs;
your refactoring will conflict with other changes.  I don't think the
benefit here is high enough to pursue this edition of the dream.


Re: [PATCH 00/12] radix-tree: split out struct radix_tree_root out to

2017-10-10 Thread Matthew Wilcox
On Mon, Oct 09, 2017 at 01:10:01AM +0900, Masahiro Yamada wrote:
> Reducing the header dependency will help for speeding the kernel
> build, suppressing unnecessary recompile of objects during
> git-bisect'ing, etc.

Well, does it?  You could provide measurements showing before/after
time to compile, or time to recompile after touching a header file that
is included by radix-tree.h and not by radix-tree-root.h.

Look at the files included (never mind the transitively included files):

#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>

These are not exactly rare files to be included.  My guess is that most
of the files in the kernel end up depending on these files *anyway*, either
directly or through some path that isn't the radix tree.


Extended IDR API

2017-09-11 Thread Matthew Wilcox

I really don't like your new API.  I wish you'd discussed it before
merging it.  Here's my redesign.  Does anybody have a suggestion for
improvement?

We have a lovely new test-suite for the IDR (in tools/testing/radix-tree)
... when adding a new API, it's polite to update the test-suite too.
Do you have any plans to add test cases?

(Compile tested only; I'm at a conference.  Also, I didn't check the
kerneldoc because I don't have Sphinx installed on my laptop.)

From ff45b2a6806cd0e4177c5a10f26c97999164c10c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawil...@microsoft.com>
Date: Mon, 11 Sep 2017 16:16:29 -0400
Subject: [PATCH] idr: Rewrite extended IDR API

 - Rename the API to be 'ul' for unsigned long instead of 'ext'.  This
   fits better with other usage in the Linux kernel.
 - idr_alloc() moves back to being a function instead of inline
 - idr_alloc_ul() takes 'nextid' as an in-out parameter like idr_get_next(),
   instead of having 'index' as an out-only parameter.
 - idr_alloc_ul() needs a __must_check to ensure that users don't look at
   the result without checking whether the function succeeded.
 - idr_alloc_ul() takes 'max' rather than 'end', or it is impossible to
   allocate the ULONG_MAX id.
 - idr_replace() can simply take an unsigned long parameter instead of
   an int.
 - idr_remove() and idr_find() are the same as idr_replace().
 - We don't need separate idr_get_free() and idr_get_free_ext().
 - Add kerneldoc for idr_alloc_ul().

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/idr.h| 75 +--
 include/linux/radix-tree.h | 17 +
 lib/idr.c  | 88 +-
 lib/radix-tree.c   |  2 +-
 net/sched/act_api.c| 22 ++--
 net/sched/cls_flower.c | 16 +
 6 files changed, 95 insertions(+), 125 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7c3a365f7e12..90faf8279559 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -81,74 +81,22 @@ static inline void idr_set_cursor(struct idr *idr, unsigned int val)
 
 void idr_preload(gfp_t gfp_mask);
 
-int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index,
- unsigned long start, unsigned long end, gfp_t gfp,
- bool ext);
-
-/**
- * idr_alloc - allocate an id
- * @idr: idr handle
- * @ptr: pointer to be associated with the new id
- * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive)
- * @gfp: memory allocation flags
- *
- * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
- * if there are no unused IDs in that range.
- *
- * Note that @end is treated as max when <= 0.  This is to always allow
- * using @start + N as @end as long as N is inside integer range.
- *
- * Simultaneous modifications to the @idr are not allowed and should be
- * prevented by the user, usually with a lock.  idr_alloc() may be called
- * concurrently with read-only accesses to the @idr, such as idr_find() and
- * idr_for_each_entry().
- */
-static inline int idr_alloc(struct idr *idr, void *ptr,
-   int start, int end, gfp_t gfp)
-{
-   unsigned long id;
-   int ret;
-
-   if (WARN_ON_ONCE(start < 0))
-   return -EINVAL;
-
-   ret = idr_alloc_cmn(idr, ptr, &id, start, end, gfp, false);
-
-   if (ret)
-   return ret;
-
-   return id;
-}
-
-static inline int idr_alloc_ext(struct idr *idr, void *ptr,
-   unsigned long *index,
-   unsigned long start,
-   unsigned long end,
-   gfp_t gfp)
-{
-   return idr_alloc_cmn(idr, ptr, index, start, end, gfp, true);
-}
-
+int idr_alloc(struct idr *, void *, int start, int end, gfp_t);
+int __must_check idr_alloc_ul(struct idr *, void *, unsigned long *nextid,
+   unsigned long max, gfp_t);
 int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
-void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
-void *idr_replace(struct idr *, void *, int id);
-void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
+void *idr_get_next_ul(struct idr *, unsigned long *nextid);
+void *idr_replace(struct idr *, void *, unsigned long id);
 void idr_destroy(struct idr *);
 
-static inline void *idr_remove_ext(struct idr *idr, unsigned long id)
+static inline void *idr_remove(struct idr *idr, unsigned long id)
 {
	return radix_tree_delete_item(&idr->idr_rt, id, NULL);
 }
 
-static inline void *idr_remove(struct idr *idr, int id)
-{
-   return idr_remove_ext(idr, id);
-}
-
 static inline void idr_init(struct idr *idr)
 {
	INIT_RADIX_TREE(&idr->idr_rt,
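
To make the proposed semantics concrete, here is a short sketch of an
idr_alloc_ul() caller, assuming the behaviour described in the list
above (in-out 'nextid', inclusive 'max', __must_check return); the
surrounding function is invented for illustration:

static int example_store(struct idr *idr, void *ptr)
{
	unsigned long id = 0;	/* in: minimum acceptable ID */
	int ret;

	/* 'max' is inclusive, so even ULONG_MAX itself is allocatable. */
	ret = idr_alloc_ul(idr, ptr, &id, ULONG_MAX, GFP_KERNEL);
	if (ret)		/* __must_check: 'id' is invalid on error */
		return ret;

	pr_info("allocated id %lu\n", id);	/* out: the allocated ID */
	return 0;
}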

Re: in_irq_or_nmi()

2017-03-29 Thread Matthew Wilcox
On Wed, Mar 29, 2017 at 11:19:49AM +0200, Peter Zijlstra wrote:
> On Wed, Mar 29, 2017 at 10:59:28AM +0200, Jesper Dangaard Brouer wrote:
> > On Wed, 29 Mar 2017 10:12:19 +0200
> > Peter Zijlstra  wrote:
> > > No, that's horrible. Also, wth is this about? A memory allocator that
> > > needs in_nmi()? That sounds beyond broken.
> > 
> > It is the other way around. We want to exclude NMI and HARDIRQ from
> > using the per-cpu-pages (pcp) lists "order-0 cache" (they will
> > fall-through using the normal buddy allocator path).
> 
> Any in_nmi() code arriving at the allocator is broken. No need to fix
> the allocator.

That's demonstrably true.  You can't grab a spinlock in NMI code and
the first thing that happens if this in_irq_or_nmi() check fails is ...
spin_lock_irqsave(&zone->lock, flags);
so this patch should just use in_irq().

(the concept of NMI code needing to allocate memory was blowing my mind
a little bit)


in_irq_or_nmi()

2017-03-27 Thread Matthew Wilcox
On Mon, Mar 27, 2017 at 05:15:00PM +0200, Jesper Dangaard Brouer wrote:
> And I also verified it worked:
> 
>   0.63 │   mov    __preempt_count,%eax
>        │ free_hot_cold_page():
>   1.25 │   test   $0x1f,%eax
>        │ ↓ jne    1e4
> 
> And this simplification also made the compiler change this into a
> unlikely branch, which is a micro-optimization (that I will leave up to
> the compiler).

Excellent!  That said, I think we should define in_irq_or_nmi() in
preempt.h, rather than hiding it in the memory allocator.  And since we're
doing that, we might as well make it look like the other definitions:

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7eeceac52dea..af98c29abd9d 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -81,6 +81,7 @@
 #define in_interrupt() (irq_count())
 #define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
 #define in_nmi()   (preempt_count() & NMI_MASK)
+#define in_irq_or_nmi()	(preempt_count() & (HARDIRQ_MASK | NMI_MASK))
 #define in_task()  (!(preempt_count() & \
   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
 

I think there are some genuine questions to be asked about the other
users of in_irq() whether they really want to use in_irq_or_nmi().
There's fewer than a hundred of them, so somebody sufficiently motivated
could take a look in a few days.


Re: Page allocator order-0 optimizations merged

2017-03-27 Thread Matthew Wilcox
On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
>  
> +static __always_inline int in_irq_or_nmi(void)
> +{
> + return in_irq() || in_nmi();
> +// XXX: hoping compiler will optimize this (todo verify) into:
> +// #define in_irq_or_nmi()   (preempt_count() & (HARDIRQ_MASK | NMI_MASK))
> +
> + /* compiler was smart enough to only read __preempt_count once
> +  * but added two branches
> +asm code:
> + │   mov__preempt_count,%eax
> + │   test   $0xf,%eax// HARDIRQ_MASK: 0x000f
> + │┌──jne2a
> + ││  test   $0x10,%eax   // NMI_MASK: 0x0010
> + ││↓ je 3f
> + │ 2a:└─→mov%rbx,%rdi
> +
> +  */
> +}

To be fair, you told the compiler to do that with your use of fancy-pants ||
instead of optimisable |.  Try this instead:

static __always_inline int in_irq_or_nmi(void)
{
return in_irq() | in_nmi();
}

1770 <in_irq_or_nmi>:
    1770:	65 8b 05 00 00 00 00	mov    %gs:0x0(%rip),%eax	# 1777 <in_irq_or_nmi+0x7>
			1773: R_X86_64_PC32	__preempt_count-0x4
#define in_nmi()(preempt_count() & NMI_MASK)
#define in_task()   (!(preempt_count() & \
   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
static __always_inline int in_irq_or_nmi(void)
{
return in_irq() | in_nmi();
    1777:	25 00 00 1f 00      	and    $0x1f,%eax
}
    177c:	c3                  	retq
    177d:	0f 1f 00            	nopl   (%rax)
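
The same effect can be reproduced outside the kernel with a small
self-contained C program; the mask values below are illustrative
stand-ins, not the real preempt_count() layout:

#include <stdio.h>

#define DEMO_HARDIRQ_MASK	0x000f0000u
#define DEMO_NMI_MASK		0x00100000u

static unsigned int demo_preempt_count;

/* Short-circuiting ||: each mask is typically tested separately,
 * producing two branches, as in the perf annotation quoted above. */
static int in_irq_or_nmi_branchy(void)
{
	return (demo_preempt_count & DEMO_HARDIRQ_MASK) ||
	       (demo_preempt_count & DEMO_NMI_MASK);
}

/* Bitwise |: both tests collapse into a single and-with-combined-mask,
 * matching the lone 'and' in the objdump output above. */
static int in_irq_or_nmi_flat(void)
{
	return demo_preempt_count & (DEMO_HARDIRQ_MASK | DEMO_NMI_MASK);
}

int main(void)
{
	demo_preempt_count = DEMO_NMI_MASK;
	printf("%d %d\n", !!in_irq_or_nmi_branchy(), !!in_irq_or_nmi_flat());
	return 0;
}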




RE: [PATCH 1/2] include: Fix checkpatch whitespace error and warning

2017-02-12 Thread Matthew Wilcox
Don't worry about this file; I have a complete rewrite already queued.

> -Original Message-
> From: Tobin C. Harding [mailto:m...@tobin.cc]
> Sent: Saturday, February 11, 2017 9:58 PM
> To: David S . Miller <da...@davemloft.net>
> Cc: Matthew Wilcox <mawil...@microsoft.com>; netdev@vger.kernel.org;
> linux-ker...@vger.kernel.org; Tobin C. Harding <m...@tobin.cc>
> Subject: [PATCH 1/2] include: Fix checkpatch whitespace error and warning
> 
> This patch fixes two trivial whitespace messages (ERROR/WARNING).
> Fixes trailing whitespace ERROR and fixes space before tabs WARNING.
> 
> Signed-off-by: Tobin C. Harding <m...@tobin.cc>
> ---
>  include/linux/idr.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/idr.h b/include/linux/idr.h
> index 3c01b89..4a8de2f 100644
> --- a/include/linux/idr.h
> +++ b/include/linux/idr.h
> @@ -1,6 +1,6 @@
>  /*
>   * include/linux/idr.h
> - *
> + *
>   * 2002-10-18  written by Jim Houston jim.hous...@ccur.com
>   *   Copyright (C) 2002 by Concurrent Computer Corporation
>   *   Distributed under the GNU GPL license version 2.
> @@ -183,7 +183,7 @@ static inline void *idr_find(struct idr *idr, int id)
>   */
>  #define IDA_CHUNK_SIZE   128 /* 128 bytes per chunk */
>  #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long) - 1)
> -#define IDA_BITMAP_BITS  (IDA_BITMAP_LONGS * sizeof(long) * 8)
> +#define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8)
> 
>  struct ida_bitmap {
>   longnr_busy;
> --
> 2.7.4



Re: [Lsf] [Lsf-pc] [LSF/MM TOPIC] Generic page-pool recycle facility?

2016-04-11 Thread Matthew Wilcox
On Mon, Apr 11, 2016 at 02:08:27PM +0100, Mel Gorman wrote:
> On Mon, Apr 11, 2016 at 02:26:39PM +0200, Jesper Dangaard Brouer wrote:
> > On arch's like PowerPC, the DMA API is the bottleneck.  To workaround
> > the cost of DMA calls, NIC driver alloc large order (compound) pages.
> > (dma_map compound page, handout page-fragments for RX ring, and later
> > dma_unmap when last RX page-fragments is seen).
> 
> So, IMO only holding onto the DMA pages is all that is justified but not a
> recycle of order-0 pages built on top of the core allocator. For DMA pages,
> it would take a bit of legwork but the per-cpu allocator could be split
> and converted to hold arbitrary sized pages with a constructor/destructor
> to do the DMA coherency step when pages are taken from or handed back to
> the core allocator. I'm not volunteering to do that unfortunately but I
> estimate it'd be a few days work unless it needs to be per-CPU and NUMA
> aware in which case the memory footprint will be high.

Have "we" tried to accelerate the DMA calls in PowerPC?  For example, it
could hold onto a cache of recently used mappings and recycle them if that
still works.  It trades off a bit of security (a device can continue to DMA
after the memory should no longer be accessible to it) for speed, but then
so does the per-driver hack of keeping pages around still mapped.
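
To make that idea concrete, a deliberately naive sketch of such a
mapping-recycling layer might look like the following; every name here
(struct, cache size, helper) is invented for illustration, and cache
insertion and eviction are omitted:

struct mapping_cache_entry {
	struct page	*page;
	dma_addr_t	addr;
};

#define MAPPING_CACHE_SLOTS	8

struct mapping_cache {
	struct mapping_cache_entry slot[MAPPING_CACHE_SLOTS];
};

/* Reuse a still-live mapping for a recently seen page; otherwise fall
 * back to a real dma_map_page().  As noted above, this trades prompt
 * revocation of device access for speed. */
static dma_addr_t cached_map(struct device *dev, struct mapping_cache *c,
			     struct page *page)
{
	int i;

	for (i = 0; i < MAPPING_CACHE_SLOTS; i++)
		if (c->slot[i].page == page)
			return c->slot[i].addr;

	return dma_map_page(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
}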



Re: [PATCH] MAINTAINERS: remove Adam Fritzler, update his email address in other sources

2007-12-18 Thread Matthew Wilcox
On Mon, Dec 17, 2007 at 08:48:03PM -0800, Joe Perches wrote:
 diff --git a/MAINTAINERS b/MAINTAINERS
 index 9507b42..690f172 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
 @@ -3758,13 +3758,6 @@ W: http://www.kernel.org/pub/linux/kernel/people/bunk/trivial/
  T:   git kernel.org:/pub/scm/linux/kernel/git/bunk/trivial.git
  S:   Maintained
  
 -TMS380 TOKEN-RING NETWORK DRIVER
 -P:   Adam Fritzler
 -M:   [EMAIL PROTECTED]
 -L:   [EMAIL PROTECTED]
 -W:   http://www.auk.cx/tms380tr/
 -S:   Maintained

That's pretty unfriendly.  Presumably the linux-tr list is still active
and would still help people with that driver.  So how about:

 TMS380 TOKEN-RING NETWORK DRIVER
-P:   Adam Fritzler
-M:   [EMAIL PROTECTED]
 L:   [EMAIL PROTECTED]
?W:?
-S:   Maintained
+S:   Orphan

(Dunno what to do about W:)

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: [BUG] New Kernel Bugs

2007-11-14 Thread Matthew Wilcox
On Wed, Nov 14, 2007 at 12:46:20AM -0700, Denys Vlasenko wrote:
 Finally they replied and asked to rediff it against their
 git tree. I did that and sent patches back. No reply since then.
 
 And mind you, the patch is not trying to do anything
 complex, it mostly moves code around, removes 'inline',
 adds 'const'. What should I think about it?

I'm waiting for an ACK/NAK from Hannes, the maintainer.  What should I
do?

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: [BUG] New Kernel Bugs

2007-11-13 Thread Matthew Wilcox
On Tue, Nov 13, 2007 at 12:50:08PM -0500, Mark Lord wrote:
 It's a 540MByte download over a slow link for everyone else.

Where do you get this number from?
$ du -sh .git/objects/pack/
249M.git/objects/pack/
$ du -sh .git/objects/
253M.git/objects/

ie about half what you claim.

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: [BUG] New Kernel Bugs

2007-11-13 Thread Matthew Wilcox
On Tue, Nov 13, 2007 at 01:43:53PM -0500, Mark Lord wrote:
 Matthew Wilcox wrote:
 ie about half what you claim.
 ..
 
 No, it's from earlier in this very thread:
 
 Adrian Bunk wrote:
 git clone \ 
 git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
 ..
 
 mkdir t
 cd t
 git clone 
 git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
 (wait half an hour)
 /usr/bin/du -s linux-2.6
 522732  linux-2.6

You're assuming that everything in linux-2.6 was downloaded; that's
not true.  Everything in linux-2.6/.git was downloaded; but then you do a
checkout which happens to approximately double the size of the linux-2.6
directory.  If you do git-clone -n, you'll get a closer estimate to the
size of the download.

I suppose git-clone should grow a -v option that it could pass to rsync
to let us find out how many bytes are actually transferred, but I'm
happy to go with 250MB as a close estimate to the amount of data to xfer.

When you compare it to the 60MB tarballs that are published, it's really
not that bad.

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: oops with recent wireless-dev tree

2007-08-30 Thread Matthew Wilcox
On Thu, Aug 30, 2007 at 07:49:49AM -0700, Stephen Hemminger wrote:
   static int port_cost(struct net_device *dev)
   {
 	if (dev->ethtool_ops->get_settings) {
  
   
   As far as I can figure out, dev->ethtool_ops is NULL and the crash
   happens while trying to dereference ...->get_settings.
 
 Devices aren't required to have ethtool_ops. The code there used to
 call ethtool directly, and it would handle the error cases. I'll rollup
 a fix this morning.

Yep, clearly my fault; that should read:

if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {

Since Stephen's already bagged fixing this, I shan't send a patch.
But if it includes something like the line above please add:

Acked-by: Matthew Wilcox [EMAIL PROTECTED]

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: oops with recent wireless-dev tree

2007-08-30 Thread Matthew Wilcox
On Thu, Aug 30, 2007 at 05:01:31PM +0200, Johannes Berg wrote:
 Jochen had a patch: http://marc.info/?l=linux-wireless&m=118842715026614&w=2

That's exactly the right patch, please add

Acked-by: Matthew Wilcox [EMAIL PROTECTED]

I just checked over the commit that introduced the bug, and I didn't
make the same mistake anywhere else.

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: [PATCH] bridge: fix OOPS when bridging device without ethtool

2007-08-30 Thread Matthew Wilcox
On Thu, Aug 30, 2007 at 08:29:32AM -0700, Stephen Hemminger wrote:
 Bridge code calls ethtool to get speed. The conversion to using
 only ethtool_ops broke the case of devices without ethtool_ops.
 This is a new regression in 2.6.23.
 
 Rearranged the switch to a logical order, and use gcc initializer.
 
 Ps: speed should have been part of the network device structure from
 the start rather than burying it in ethtool.

Feel free to do the conversion ;-)  One of the things I like about the
ethtool framework is it gives us a way to take stuff out of the drivers
and put it in the midlayer without disturbing userspace.

 Signed-off-by: Stephen Hemminger [EMAIL PROTECTED]

Acked-by: Matthew Wilcox [EMAIL PROTECTED]

-- 
Intel are signing my paycheques ... these opinions are still mine
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


Re: [2.6 patch] unexport dev_ethtool

2007-08-14 Thread Matthew Wilcox
On Tue, Aug 14, 2007 at 11:22:03PM +0200, Adrian Bunk wrote:
 This patch removes the no longer used EXPORT_SYMBOL(dev_ethtool).
 
 Signed-off-by: Adrian Bunk [EMAIL PROTECTED]
Acked-by: Matthew Wilcox [EMAIL PROTECTED]

-- 
Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step.


[PATCH] ethtool_perm_addr only has one implementation

2007-07-29 Thread Matthew Wilcox

All drivers implement ethtool get_perm_addr the same way -- by calling
the generic function.  So we can inline the generic function into the
caller and avoid going through the drivers.

Signed-off-by: Matthew Wilcox [EMAIL PROTECTED]

diff -u b/net/core/ethtool.c b/net/core/ethtool.c
--- b/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -95,18 +95,6 @@
return 0;
 }
 
-int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr 
*addr, u8 *data)
-{
-   unsigned char len = dev->addr_len;
-   if ( addr->size < len )
-   return -ETOOSMALL;
-
-   addr->size = len;
-   memcpy(data, dev->perm_addr, len);
-   return 0;
-}
-
-
 u32 ethtool_op_get_ufo(struct net_device *dev)
 {
	return (dev->features & NETIF_F_UFO) != 0;
@@ -779,34 +767,20 @@
 static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
 {
struct ethtool_perm_addr epaddr;
-   u8 *data;
-   int ret;
 
-   if (!dev->ethtool_ops->get_perm_addr)
-   return -EOPNOTSUPP;
-
-   if (copy_from_user(&epaddr,useraddr,sizeof(epaddr)))
+   if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
	return -EFAULT;
 
-   data = kmalloc(epaddr.size, GFP_USER);
-   if (!data)
-   return -ENOMEM;
-
-   ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data);
-   if (ret)
-   return ret;
+   if (epaddr.size < dev->addr_len)
+   return -ETOOSMALL;
+   epaddr.size = dev->addr_len;
 
-   ret = -EFAULT;
	if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
-   goto out;
+   return -EFAULT;
useraddr += sizeof(epaddr);
-   if (copy_to_user(useraddr, data, epaddr.size))
-   goto out;
-   ret = 0;
-
- out:
-   kfree(data);
-   return ret;
+   if (copy_to_user(useraddr, dev->perm_addr, epaddr.size))
+   return -EFAULT;
+   return 0;
 }
 
 /* The main entry point in this file.  Called from net/core/dev.c */
@@ -976,7 +950,6 @@
 
 EXPORT_SYMBOL(dev_ethtool);
 EXPORT_SYMBOL(ethtool_op_get_link);
-EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
 EXPORT_SYMBOL(ethtool_op_get_sg);
 EXPORT_SYMBOL(ethtool_op_get_tso);
 EXPORT_SYMBOL(ethtool_op_get_tx_csum);
only in patch2:
unchanged:
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -2886,7 +2886,6 @@ static const struct ethtool_ops vortex_ethtool_ops = {
.set_settings   = vortex_set_settings,
.get_link   = ethtool_op_get_link,
.nway_reset = vortex_nway_reset,
-   .get_perm_addr  = ethtool_op_get_perm_addr,
 };
 
 #ifdef CONFIG_PCI
only in patch2:
unchanged:
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -1578,7 +1578,6 @@ static const struct ethtool_ops cp_ethtool_ops = {
.set_wol= cp_set_wol,
.get_strings= cp_get_strings,
.get_ethtool_stats  = cp_get_ethtool_stats,
-   .get_perm_addr  = ethtool_op_get_perm_addr,
.get_eeprom_len = cp_get_eeprom_len,
.get_eeprom = cp_get_eeprom,
.set_eeprom = cp_set_eeprom,
only in patch2:
unchanged:
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -2452,7 +2452,6 @@ static const struct ethtool_ops rtl8139_ethtool_ops = {
.get_strings= rtl8139_get_strings,
.get_stats_count= rtl8139_get_stats_count,
.get_ethtool_stats  = rtl8139_get_ethtool_stats,
-   .get_perm_addr  = ethtool_op_get_perm_addr,
 };
 
 static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
only in patch2:
unchanged:
--- a/drivers/net/ax88796.c
+++ b/drivers/net/ax88796.c
@@ -580,7 +580,6 @@ static const struct ethtool_ops ax_ethtool_ops = {
.set_settings   = ax_set_settings,
.nway_reset = ax_nway_reset,
.get_link   = ax_get_link,
-   .get_perm_addr  = ethtool_op_get_perm_addr,
 };
 
 /* setup code */
only in patch2:
unchanged:
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -2033,7 +2033,6 @@ static const struct ethtool_ops b44_ethtool_ops = {
.get_strings= b44_get_strings,
.get_stats_count= b44_get_stats_count,
.get_ethtool_stats  = b44_get_ethtool_stats,
-   .get_perm_addr  = ethtool_op_get_perm_addr,
 };
 
 static int b44_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
only in patch2:
unchanged:
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -6269,7 +6269,6 @@ static const struct ethtool_ops bnx2_ethtool_ops = {
.phys_id= bnx2_phys_id,
.get_stats_count= bnx2_get_stats_count,
.get_ethtool_stats  = bnx2_get_ethtool_stats,
-   .get_perm_addr  = ethtool_op_get_perm_addr,
 };
 
 /* Called with rtnl_lock */
only in patch2:
unchanged:
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b

Re: [PATCH] Rewrite e100_phys_id

2006-11-07 Thread Matthew Wilcox
On Tue, Nov 07, 2006 at 10:33:14AM -0800, Auke Kok wrote:
 Matthew Wilcox wrote:
 Tested on the internal interface of an HP Integrity rx2600.
 
 bad news, it's completely hosed. The adapter does some indistinguishable 
 blinking for a second, then stops blinking altogether.

Weird.  I tested it on the only e100 I have access to, and it worked.
I've just reviewed the patch you quoted below, and I don't see what the
problem is.  

I wonder if this is wrong:

 -mdio_write(netdev, nic->mii.phy_id, MII_LED_CONTROL, 0);
 +mdio_write(netdev, nic->mii.phy_id, MII_LED_CONTROL, led_off);

but everything else seems pretty straight-forward.

 I might revert the code to the old situation. I guess I should have tested 
 it initially right away.
 
 I'm not even going to touch the e1000 patch for now ;)
 
 Auke
 
 
 Signed-off-by: Matthew Wilcox [EMAIL PROTECTED]
 
 diff --git a/drivers/net/e100.c b/drivers/net/e100.c
 index a3a08a5..aade1e9 100644
 --- a/drivers/net/e100.c
 +++ b/drivers/net/e100.c
 @@ -556,7 +556,6 @@ struct nic {
  struct params params;
  struct net_device_stats net_stats;
  struct timer_list watchdog;
 -struct timer_list blink_timer;
  struct mii_if_info mii;
  struct work_struct tx_timeout_task;
  enum loopback loopback;
 @@ -581,7 +580,6 @@ struct nic {
  u32 rx_over_length_errors;
  
  u8 rev_id;
 -u16 leds;
  u16 eeprom_wc;
  u16 eeprom[256];
  spinlock_t mdio_lock;
 @@ -2168,23 +2166,6 @@ err_clean_rx:
  return err;
  }
  
 -#define MII_LED_CONTROL 0x1B
 -static void e100_blink_led(unsigned long data)
 -{
 -struct nic *nic = (struct nic *)data;
 -enum led_state {
 -led_on = 0x01,
 -led_off= 0x04,
 -led_on_559 = 0x05,
 -led_on_557 = 0x07,
 -};
 -
 -nic->leds = (nic->leds & led_on) ? led_off :
 -(nic->mac < mac_82559_D101M) ? led_on_557 : led_on_559;
 -mdio_write(nic->netdev, nic->mii.phy_id, MII_LED_CONTROL, nic->leds);
 -mod_timer(&nic->blink_timer, jiffies + HZ / 4);
 -}
 -
  static int e100_get_settings(struct net_device *netdev, struct 
  ethtool_cmd *cmd)
  {
  struct nic *nic = netdev_priv(netdev);
 @@ -2411,16 +2392,32 @@ static void e100_diag_test(struct net_de
  msleep_interruptible(4 * 1000);
  }
  
 +#define MII_LED_CONTROL 0x1B
  static int e100_phys_id(struct net_device *netdev, u32 data)
  {
  struct nic *nic = netdev_priv(netdev);
 +int i;
 +
 +enum led_state {
 +led_off= 0x04,
 +led_on_559 = 0x05,
 +led_on_557 = 0x07,
 +};
 +u16 leds = led_off;
 +
 +if (data == 0)
 +data = 2;
 +
 +for (i = 0; i < (data * 2); i++) {
 +leds = (leds == led_off) ?
 +(nic->mac < mac_82559_D101M) ? led_on_557 : led_on_559 :
 +led_off;
 +mdio_write(nic->netdev, nic->mii.phy_id, MII_LED_CONTROL, leds);
 +if (msleep_interruptible(500))
 +break;
 +}
  
 -if(!data || data > (u32)(MAX_SCHEDULE_TIMEOUT / HZ))
 -data = (u32)(MAX_SCHEDULE_TIMEOUT / HZ);
 -mod_timer(&nic->blink_timer, jiffies);
 -msleep_interruptible(data * 1000);
 -del_timer_sync(&nic->blink_timer);
 -mdio_write(netdev, nic->mii.phy_id, MII_LED_CONTROL, 0);
 +mdio_write(netdev, nic->mii.phy_id, MII_LED_CONTROL, led_off);
  
  return 0;
  }
 @@ -2633,9 +2630,6 @@ #endif
 init_timer(&nic->watchdog);
 nic->watchdog.function = e100_watchdog;
 nic->watchdog.data = (unsigned long)nic;
 -init_timer(&nic->blink_timer);
 -nic->blink_timer.function = e100_blink_led;
 -nic->blink_timer.data = (unsigned long)nic;
 
 INIT_WORK(&nic->tx_timeout_task,
 (void (*)(void *))e100_tx_timeout_task, netdev);


[PATCH] Rewrite e100_phys_id

2006-10-26 Thread Matthew Wilcox

The motivator for this was to fix the sparse warning:

drivers/net/e100.c:2418:48: warning: cast truncates bits from constant value (83126e978d4fdf becomes 978d4fdf)
drivers/net/e100.c:2419:37: warning: cast truncates bits from constant value (83126e978d4fdf becomes 978d4fdf)

Initially, I tried a quick fix, but when it ran into difficulties, I
looked at tg3.c to see how it does it.  I liked their way better, so I
rewrote e100.c to be similar.  It shaves ~700 bytes off the size of the
driver, and a few bytes off the size of struct nic, so I think it's a
win all round.  Tested on the internal interface of an HP Integrity rx2600.

Signed-off-by: Matthew Wilcox [EMAIL PROTECTED]

diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index a3a08a5..aade1e9 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -556,7 +556,6 @@ struct nic {
struct params params;
struct net_device_stats net_stats;
struct timer_list watchdog;
-   struct timer_list blink_timer;
struct mii_if_info mii;
struct work_struct tx_timeout_task;
enum loopback loopback;
@@ -581,7 +580,6 @@ struct nic {
u32 rx_over_length_errors;
 
u8 rev_id;
-   u16 leds;
u16 eeprom_wc;
u16 eeprom[256];
spinlock_t mdio_lock;
@@ -2168,23 +2166,6 @@ err_clean_rx:
return err;
 }
 
-#define MII_LED_CONTROL	0x1B
-static void e100_blink_led(unsigned long data)
-{
-   struct nic *nic = (struct nic *)data;
-   enum led_state {
-   led_on = 0x01,
-   led_off= 0x04,
-   led_on_559 = 0x05,
-   led_on_557 = 0x07,
-   };
-
-   nic->leds = (nic->leds & led_on) ? led_off :
-   (nic->mac < mac_82559_D101M) ? led_on_557 : led_on_559;
-   mdio_write(nic->netdev, nic->mii.phy_id, MII_LED_CONTROL, nic->leds);
-   mod_timer(&nic->blink_timer, jiffies + HZ / 4);
-}
-
 static int e100_get_settings(struct net_device *netdev, struct ethtool_cmd 
*cmd)
 {
struct nic *nic = netdev_priv(netdev);
@@ -2411,16 +2392,32 @@ static void e100_diag_test(struct net_de
msleep_interruptible(4 * 1000);
 }
 
+#define MII_LED_CONTROL	0x1B
 static int e100_phys_id(struct net_device *netdev, u32 data)
 {
struct nic *nic = netdev_priv(netdev);
+   int i;
+
+   enum led_state {
+   led_off= 0x04,
+   led_on_559 = 0x05,
+   led_on_557 = 0x07,
+   };
+   u16 leds = led_off;
+
+   if (data == 0)
+   data = 2;
+
+   for (i = 0; i < (data * 2); i++) {
+   leds = (leds == led_off) ?
+   (nic->mac < mac_82559_D101M) ? led_on_557 : led_on_559 :
+   led_off;
+   mdio_write(nic->netdev, nic->mii.phy_id, MII_LED_CONTROL, leds);
+   if (msleep_interruptible(500))
+   break;
+   }
 
-   if(!data || data > (u32)(MAX_SCHEDULE_TIMEOUT / HZ))
-   data = (u32)(MAX_SCHEDULE_TIMEOUT / HZ);
-   mod_timer(&nic->blink_timer, jiffies);
-   msleep_interruptible(data * 1000);
-   del_timer_sync(&nic->blink_timer);
-   mdio_write(netdev, nic->mii.phy_id, MII_LED_CONTROL, 0);
+   mdio_write(netdev, nic->mii.phy_id, MII_LED_CONTROL, led_off);
 
return 0;
 }
@@ -2633,9 +2630,6 @@ #endif
	init_timer(&nic->watchdog);
	nic->watchdog.function = e100_watchdog;
	nic->watchdog.data = (unsigned long)nic;
-   init_timer(&nic->blink_timer);
-   nic->blink_timer.function = e100_blink_led;
-   nic->blink_timer.data = (unsigned long)nic;
 
	INIT_WORK(&nic->tx_timeout_task,
	(void (*)(void *))e100_tx_timeout_task, netdev);
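
As a footnote on the sparse warning that motivated this rewrite, the
truncation it complains about is easy to reproduce in a standalone
program; the constant is the one sparse printed, which equals
LONG_MAX / 250, i.e. MAX_SCHEDULE_TIMEOUT / HZ with HZ=250 (an
inference from the reported value, not stated in the mail):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	long long timeout = 0x83126e978d4fdfLL;
	uint32_t truncated = (uint32_t)timeout;	/* keeps only the low 32 bits */

	/* prints: 83126e978d4fdf becomes 978d4fdf */
	printf("%llx becomes %x\n", timeout, truncated);
	return 0;
}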

