Re: [PATCH v4 3/8] MIPS: Octeon: Add a global resource manager.

2017-11-30 Thread Philippe Ombredanne
Carlos,

On Thu, Nov 30, 2017 at 11:53 PM, James Hogan  wrote:
> On Tue, Nov 28, 2017 at 04:55:35PM -0800, David Daney wrote:
>> From: Carlos Munoz 
>>
>> Add a global resource manager to manage tagged pointers within
>> bootmem allocated memory. This is used by various functional
>> blocks in the Octeon core like the FPA, Ethernet nexus, etc.
>>
>> Signed-off-by: Carlos Munoz 
>> Signed-off-by: Steven J. Hill 
>> Signed-off-by: David Daney 
>> ---
>>  arch/mips/cavium-octeon/Makefile   |   3 +-
>>  arch/mips/cavium-octeon/resource-mgr.c | 371 
>> +
>>  arch/mips/include/asm/octeon/octeon.h  |  18 ++
>>  3 files changed, 391 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/mips/cavium-octeon/resource-mgr.c
>>
>> diff --git a/arch/mips/cavium-octeon/Makefile 
>> b/arch/mips/cavium-octeon/Makefile
>> index 7c02e542959a..0a299ab8719f 100644
>> --- a/arch/mips/cavium-octeon/Makefile
>> +++ b/arch/mips/cavium-octeon/Makefile
>> @@ -9,7 +9,8 @@
>>  # Copyright (C) 2005-2009 Cavium Networks
>>  #
>>
>> -obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o
>> +obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o \
>> +  resource-mgr.o
>
> Maybe put that on a separate line like below.
>
>>  obj-y += dma-octeon.o
>>  obj-y += octeon-memcpy.o
>>  obj-y += executive/
>> diff --git a/arch/mips/cavium-octeon/resource-mgr.c 
>> b/arch/mips/cavium-octeon/resource-mgr.c
>> new file mode 100644
>> index ..ca25fa953402
>> --- /dev/null
>> +++ b/arch/mips/cavium-octeon/resource-mgr.c
>> @@ -0,0 +1,371 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Resource manager for Octeon.
>> + *
>> + * This file is subject to the terms and conditions of the GNU General 
>> Public
>> + * License.  See the file "COPYING" in the main directory of this archive
>> + * for more details.
>> + *
>> + * Copyright (C) 2017 Cavium, Inc.
>> + */

Since you nicely included an SPDX id, you would not need the
boilerplate anymore. e.g. these can go alright?

>> + * This file is subject to the terms and conditions of the GNU General 
>> Public
>> + * License.  See the file "COPYING" in the main directory of this archive
>> + * for more details.

-- 
Cordially
Philippe Ombredanne


[PATCH 1/1] timecounter: Make cyclecounter struct part of timecounter struct

2017-11-30 Thread Sagar Arun Kamble
There is no real need for the users of timecounters to define cyclecounter
and timecounter variables separately. Since timecounter will always be
based on cyclecounter, have cyclecounter struct as member of timecounter
struct.

Suggested-by: Chris Wilson 
Signed-off-by: Sagar Arun Kamble 
Cc: Chris Wilson 
Cc: John Stultz 
Cc: Thomas Gleixner 
Cc: Stephen Boyd 
Cc: linux-ker...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: netdev@vger.kernel.org
Cc: intel-wired-...@lists.osuosl.org
Cc: linux-r...@vger.kernel.org
Cc: alsa-de...@alsa-project.org
Cc: kvm...@lists.cs.columbia.edu
---
 arch/microblaze/kernel/timer.c | 20 ++--
 drivers/clocksource/arm_arch_timer.c   | 19 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c   |  3 +-
 drivers/net/ethernet/amd/xgbe/xgbe-ptp.c   |  9 +++---
 drivers/net/ethernet/amd/xgbe/xgbe.h   |  1 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h|  1 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   | 20 ++--
 drivers/net/ethernet/freescale/fec.h   |  1 -
 drivers/net/ethernet/freescale/fec_ptp.c   | 30 +-
 drivers/net/ethernet/intel/e1000e/e1000.h  |  1 -
 drivers/net/ethernet/intel/e1000e/netdev.c | 27 
 drivers/net/ethernet/intel/e1000e/ptp.c|  2 +-
 drivers/net/ethernet/intel/igb/igb.h   |  1 -
 drivers/net/ethernet/intel/igb/igb_ptp.c   | 25 ---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h   |  1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c   | 17 +-
 drivers/net/ethernet/mellanox/mlx4/en_clock.c  | 28 -
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  1 -
 .../net/ethernet/mellanox/mlx5/core/lib/clock.c| 34 ++--
 drivers/net/ethernet/qlogic/qede/qede_ptp.c| 20 ++--
 drivers/net/ethernet/ti/cpts.c | 36 --
 drivers/net/ethernet/ti/cpts.h |  1 -
 include/linux/mlx5/driver.h|  1 -
 include/linux/timecounter.h|  4 +--
 include/sound/hdaudio.h|  1 -
 kernel/time/timecounter.c  | 28 -
 sound/hda/hdac_stream.c|  7 +++--
 virt/kvm/arm/arch_timer.c  |  6 ++--
 28 files changed, 163 insertions(+), 182 deletions(-)

diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c
index 7de941c..b7f89e9 100644
--- a/arch/microblaze/kernel/timer.c
+++ b/arch/microblaze/kernel/timer.c
@@ -199,27 +199,25 @@ static u64 xilinx_read(struct clocksource *cs)
return (u64)xilinx_clock_read();
 }
 
-static struct timecounter xilinx_tc = {
-   .cc = NULL,
-};
-
 static u64 xilinx_cc_read(const struct cyclecounter *cc)
 {
return xilinx_read(NULL);
 }
 
-static struct cyclecounter xilinx_cc = {
-   .read = xilinx_cc_read,
-   .mask = CLOCKSOURCE_MASK(32),
-   .shift = 8,
+static struct timecounter xilinx_tc = {
+   .cc.read = xilinx_cc_read,
+   .cc.mask = CLOCKSOURCE_MASK(32),
+   .cc.mult = 0,
+   .cc.shift = 8,
 };
 
 static int __init init_xilinx_timecounter(void)
 {
-   xilinx_cc.mult = div_sc(timer_clock_freq, NSEC_PER_SEC,
-   xilinx_cc.shift);
+   struct cyclecounter *cc = _tc.cc;
+
+   cc->mult = div_sc(timer_clock_freq, NSEC_PER_SEC, cc->shift);
 
-   timecounter_init(_tc, _cc, sched_clock());
+   timecounter_init(_tc, sched_clock());
 
return 0;
 }
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 57cb2f0..31543e5 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -179,11 +179,6 @@ static u64 arch_counter_read_cc(const struct cyclecounter 
*cc)
.flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static struct cyclecounter cyclecounter __ro_after_init = {
-   .read   = arch_counter_read_cc,
-   .mask   = CLOCKSOURCE_MASK(56),
-};
-
 struct ate_acpi_oem_info {
char oem_id[ACPI_OEM_ID_SIZE + 1];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
@@ -915,7 +910,10 @@ static u64 arch_counter_get_cntvct_mem(void)
return ((u64) vct_hi << 32) | vct_lo;
 }
 
-static struct arch_timer_kvm_info arch_timer_kvm_info;
+static struct arch_timer_kvm_info arch_timer_kvm_info = {
+   .timecounter.cc.read = arch_counter_read_cc,
+   .timecounter.cc.mask = CLOCKSOURCE_MASK(56),
+};
 
 struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
 {
@@ -925,6 +923,7 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
 static void __init arch_counter_register(unsigned type)
 {
u64 start_count;
+   

Re: Creating cyclecounter and lock member in timecounter structure [ Was Re: [RFC 1/4] drm/i915/perf: Add support to correlate GPU timestamp with system time]

2017-11-30 Thread Sagar Arun Kamble



On 12/1/2017 2:33 AM, Saeed Mahameed wrote:

On Mon, Nov 27, 2017 at 2:05 AM, Sagar Arun Kamble
 wrote:


On 11/24/2017 7:01 PM, Thomas Gleixner wrote:

On Fri, 24 Nov 2017, Sagar Arun Kamble wrote:

On 11/24/2017 12:29 AM, Thomas Gleixner wrote:

On Thu, 23 Nov 2017, Sagar Arun Kamble wrote:

We needed inputs on possible optimization that can be done to
timecounter/cyclecounter structures/usage.
This mail is in response to review of patch
https://patchwork.freedesktop.org/patch/188448/.

As Chris's observation below, about dozen of timecounter users in the
kernel
have below structures
defined individually:

spinlock_t lock;
struct cyclecounter cc;
struct timecounter tc;

Can we move lock and cc to tc? That way it will be convenient.
Also it will allow unifying the locking/overflow watchdog handling
across
all
drivers.

Looks like none of the timecounter usage sites has a real need to
separate
timecounter and cyclecounter.

Yes. Will share patch for this change.


The lock is a different question. The locking of the various drivers
differs and I have no idea how you want to handle that. Just sticking
the
lock into the datastructure and then not making use of it in the
timercounter code and leave it to the callsites does not make sense.

Most of the locks are held around timecounter_read. In some instances it
is held when cyclecounter is updated standalone or is updated along with
timecounter calls.  Was thinking if we move the lock in timecounter
functions, drivers just have to do locking around its operations on
cyclecounter. But then another problem I see is there are variation of
locking calls like lock_irqsave, lock_bh, write_lock_irqsave (some using
rwlock_t). Should this all locking be left to driver only then?

You could have the lock in the struct and protect the inner workings in
the
related core functions.

That might remove locking requirements from some of the callers and the
others still have their own thing around it.


For drivers having static/fixed cyclecounter, we can rely only on lock
inside timecounter.
Most of the network drivers update cyclecounter at runtime and they will
have to rely on two locks if
we add one to timecounter. This may not be efficient for them. Also the lock
in timecounter has to be less restrictive (may be seqlock) I guess.

Cc'd Mellanox list for inputs on this.

I have started feeling that the current approach of drivers managing the
locks is the right one so better leave the
lock out of timecounter.


I agree here,

In mlx5 we rely on our own read/write lock to serialize access to
mlx5_clock struct (mlx5 timecounter and cyclecounter).
the access is not as simple as
lock()
call time_counter_API
unlock()

Sometimes we also explicitly update/adjust timecycles counters with
mlx5 specific calculations after we read the timecounter all inside
our lock.
e.g.
@mlx5_ptp_adjfreq()

 write_lock_irqsave(>lock, flags);
 timecounter_read(>tc);
 clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff :
clock->nominal_c_mult + diff;
 write_unlock_irqrestore(>lock, flags);

So i don't think it will be a simple task to have a generic thread
safe timecounter API, without the need to specifically adjust it for
all driver use-cases.
Also as said above, in runtime it is not obvious in which context the
timecounter will be accessed irq/soft irq/user.

let's keep it as is, and let the driver decide which locking scheme is
most suitable for it.


Yes. Thanks for your inputs Saeed.

Regards
Sagar



Thanks,
Saeed.


Thanks,

 tglx


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html




Re: KASAN: stack-out-of-bounds Read in xfrm_state_find (3)

2017-11-30 Thread Steffen Klassert
On Wed, Nov 22, 2017 at 08:05:00AM -0800, syzbot wrote:
> syzkaller has found reproducer for the following crash on
> 0c86a6bd85ff0629cd2c5141027fc1c8bb6cde9c
> git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> 
> BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x30fc/0x3230
> net/xfrm/xfrm_state.c:1051
> Read of size 4 at addr 8801ccaa7af8 by task syzkaller231684/3045

The patch below should fix this. I plan to apply it to the ipsec tree
after some advanced testing.

Subject: [PATCH RFC] xfrm: Fix stack-out-of-bounds with misconfigured transport
 mode policies.

On policies with a transport mode template, we pass the addresses
from the flowi to xfrm_state_find(), assuming that the IP addresses
(and address family) don't change during transformation.

Unfortunately our policy template validation is not strict enough.
It is possible to configure policies with transport mode template
where the address family of the template does not match the selectors
address family. This leads to stack-out-of-bounds reads because
we compare addresses of the wrong family. Fix this by refusing
such a configuration, address family can not change on transport
mode.

We use the assumption that, on transport mode, the first templates
address family must match the address family of the policy selector.
Subsequent transport mode templates must match the address family of
the previous template.

Reported-by: syzbot 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_user.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 983b0233767b..57ad016ae675 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1419,11 +1419,14 @@ static void copy_templates(struct xfrm_policy *xp, 
struct xfrm_user_tmpl *ut,
 
 static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 {
+   u16 prev_family;
int i;
 
if (nr > XFRM_MAX_DEPTH)
return -EINVAL;
 
+   prev_family = family;
+
for (i = 0; i < nr; i++) {
/* We never validated the ut->family value, so many
 * applications simply leave it at zero.  The check was
@@ -1435,6 +1438,12 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl 
*ut, u16 family)
if (!ut[i].family)
ut[i].family = family;
 
+   if ((ut[i].mode == XFRM_MODE_TRANSPORT) &&
+   (ut[i].family != prev_family))
+   return -EINVAL;
+
+   prev_family = ut[i].family;
+
switch (ut[i].family) {
case AF_INET:
break;
-- 
2.14.1



With All Due Respect!!!

2017-11-30 Thread Mrs.Louisa Benicio
My Sincere Greetings,

I am (MRS.LOUISA BENICIO); I have decided to donate what I have to you
/ Motherless babies/ Less privileged/ Churches/ Widows' because I am
dying and diagnosed for cancer for two years now. I have been touched
by Almighty God to donate from what I have inherited from my late
husband to you for good work of Merciful God. I have asked Almighty
God to forgive me and believe he has because he is a Merciful God I
will be going in for another surgery soonest.

I found your email address through random search from internet and I
decided to contact you and to will/donate the sum of ($5.5million
Dollars) to you for the good work of God Almighty, and also to help
the motherless and less privilege and also forth assistance of the
widows.

I wish you all the best and May Almighty God blesses you abundantly,
and please uses the funds judiciously and always extends the good work
to others. As soon you get back to me, I shall give you info on what I
need from you then you will contact the bank and tell them I have
willed those properties to you by quoting my personal file routing and
account information. And I have also notified the bank that I am
willing that properties to you for a good, effective and prudent work.
I know I don't know you but I have been directed to do this by God
Almighty.

Please if you would be able to use the funds for the work of humanity
as I have stated hear to fulfilled my late husband wishes  kindly
reply back to me through this email address
(mrslouisabeni...@yahoo.com) for more details.

Yours Faithfully,
MRS.LOUISA BENICIO.


Re: [RFC PATCH] net_sched: bulk free tcf_block

2017-11-30 Thread Cong Wang
On Wed, Nov 29, 2017 at 6:25 AM, Paolo Abeni  wrote:
> Currently deleting qdisc with a large number of children and filters
> can take a lot of time:
>
> tc qdisc add dev lo root htb
> for I in `seq 1 1000`; do
> tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
> tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
> for J in `seq 1 10`; do
> tc filter add dev lo parent $((I + 1)): u32 match ip src 
> 1.1.1.$J
> done
> done
> time tc qdisc del dev root
>
> real0m54.764s
> user0m0.023s
> sys 0m0.000s
>
> This is due to the multiple rcu_barrier() calls, one for each tcf_block
> freed, invoked with the rtnl lock held. Most other network related
> tasks will block in this timespan.

Yeah, Eric pointed out this too and I already had an idea to cure
this.

As I already mentioned before, my idea is to refcnt the tcf block
so that we don't need to worry about which is the last. Something
like the attached patch below, note it is PoC _only_, not even
compiled yet. And I am not 100% sure it works either, I will look
deeper tomorrow.

Thanks.
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..b051c519fd48 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -279,6 +279,7 @@ struct tcf_block {
struct Qdisc *q;
struct list_head cb_list;
struct work_struct work;
+   unsigned int nr_chains;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ddcf04b4ab43..da74b311f09e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -190,6 +190,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block 
*block,
return NULL;
list_add_tail(>list, >chain_list);
chain->block = block;
+   block->nr_chains++;
chain->index = chain_index;
chain->refcnt = 1;
return chain;
@@ -218,8 +219,12 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 
 static void tcf_chain_destroy(struct tcf_chain *chain)
 {
+   struct tcf_block *block = chain->block;
+
list_del(>list);
kfree(chain);
+   if (!--block->nr_chains)
+   kfree(block);
 }
 
 static void tcf_chain_hold(struct tcf_chain *chain)
@@ -341,7 +346,6 @@ static void tcf_block_put_final(struct work_struct *work)
list_for_each_entry_safe(chain, tmp, >chain_list, list)
tcf_chain_put(chain);
rtnl_unlock();
-   kfree(block);
 }
 
 /* XXX: Standalone actions are not allowed to jump to any chain, and bound
@@ -365,11 +369,6 @@ void tcf_block_put_ext(struct tcf_block *block, struct 
Qdisc *q,
tcf_block_offload_unbind(block, q, ei);
 
INIT_WORK(>work, tcf_block_put_final);
-   /* Wait for existing RCU callbacks to cool down, make sure their works
-* have been queued before this. We can not flush pending works here
-* because we are holding the RTNL lock.
-*/
-   rcu_barrier();
tcf_queue_work(>work);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);


Re: [PATCH 1/3] vhost: fix skb leak in handle_rx()

2017-11-30 Thread Jason Wang



On 2017年12月01日 13:54, w...@redhat.com wrote:

From: Wei Xu 

Matthew found a roughly 40% tcp throughput regression with commit
c67df11f(vhost_net: try batch dequing from skb array) as discussed
in the following thread:
https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html

Eventually we figured out that it was a skb leak in handle_rx()
when sending packets to the VM. This usually happens when a guest
can not drain out vq as fast as vhost fills in, afterwards it sets
off the traffic jam and leaks skb(s) which occurs as no headcount
to send on the vq from vhost side.

This can be avoided by making sure we have got enough headcount
before actually consuming a skb from the batched rx array while
transmitting, which is simply done by moving checking the zero
headcount a bit ahead.

Signed-off-by: Wei Xu 
Reported-by: Matthew Rosato 
---
  drivers/vhost/net.c | 20 ++--
  1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8d626d7..c7bdeb6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -778,16 +778,6 @@ static void handle_rx(struct vhost_net *net)
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
-   if (nvq->rx_array)
-   msg.msg_control = vhost_net_buf_consume(>rxq);
-   /* On overrun, truncate and discard */
-   if (unlikely(headcount > UIO_MAXIOV)) {
-   iov_iter_init(_iter, READ, vq->iov, 1, 1);
-   err = sock->ops->recvmsg(sock, ,
-1, MSG_DONTWAIT | MSG_TRUNC);
-   pr_debug("Discarded rx packet: len %zd\n", sock_len);
-   continue;
-   }
/* OK, now we need to know about added descriptors. */
if (!headcount) {
if (unlikely(vhost_enable_notify(>dev, vq))) {
@@ -800,6 +790,16 @@ static void handle_rx(struct vhost_net *net)
 * they refilled. */
goto out;
}
+   if (nvq->rx_array)
+   msg.msg_control = vhost_net_buf_consume(>rxq);
+   /* On overrun, truncate and discard */
+   if (unlikely(headcount > UIO_MAXIOV)) {
+   iov_iter_init(_iter, READ, vq->iov, 1, 1);
+   err = sock->ops->recvmsg(sock, ,
+1, MSG_DONTWAIT | MSG_TRUNC);
+   pr_debug("Discarded rx packet: len %zd\n", sock_len);
+   continue;
+   }
/* We don't need to be notified again. */
iov_iter_init(_iter, READ, vq->iov, in, vhost_len);
fixup = msg.msg_iter;


I suggest to reorder this patch to 3/3.

Thanks


Re: [PATCH 3/3] tap: free skb if flags error

2017-11-30 Thread Jason Wang



On 2017年12月01日 13:54, w...@redhat.com wrote:

From: Wei Xu 

tap_recvmsg() supports accepting skb by msg_control after
commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
the skb if presented should be freed within the function, otherwise
it would be leaked.

Signed-off-by: Wei Xu 
Reported-by: Matthew Rosato 
---
  drivers/net/tap.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index e9489b8..1c66b75 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1154,9 +1154,13 @@ static int tap_recvmsg(struct socket *sock, struct 
msghdr *m,
   size_t total_len, int flags)
  {
struct tap_queue *q = container_of(sock, struct tap_queue, sock);
+   struct sk_buff *skb = m->msg_control;
int ret;
-   if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+   if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
+   if (skb)
+   kfree_skb(skb);
return -EINVAL;
+   }
ret = tap_do_read(q, >msg_iter, flags & MSG_DONTWAIT,
  m->msg_control);


Need to deal with iov_iterator_count() == 0.

Thanks


if (ret > total_len) {




Re: [PATCH net v2 2/3] xfrm: Add an activate() offload dev op

2017-11-30 Thread Steffen Klassert
On Tue, Nov 28, 2017 at 07:55:41PM +0200, av...@mellanox.com wrote:
> From: Aviv Heller 
> 
> Adding the state to the offload device prior to replay init in
> xfrm_state_construct() will result in NULL dereference if a matching
> ESP packet is received in between.
> 
> In order to inhibit driver offload logic from processing the state's
> packets prior to the xfrm_state object being completely initialized and
> added to the SADBs, a new activate() operation was added to inform the
> driver the aforementioned conditions have been met.

We discussed this already some time ago, and I still think that
we should fix this by setting XFRM_STATE_VALID only after the
state is fully initialized.



Re: [PATCH 2/3] tun: free skb in early errors

2017-11-30 Thread Jason Wang



On 2017年12月01日 13:54, w...@redhat.com wrote:

From: Wei Xu 

tun_recvmsg() supports accepting skb by msg_control after
commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
the skb if presented should be freed within the function, otherwise it
would be leaked.

Signed-off-by: Wei Xu 
Reported-by: Matthew Rosato 
---
  drivers/net/tun.c | 14 +++---
  1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6a7bde9..5563430 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2067,14 +2067,17 @@ static int tun_recvmsg(struct socket *sock, struct 
msghdr *m, size_t total_len,
  {
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+   struct sk_buff *skb = m->msg_control;
int ret;
  
-	if (!tun)

-   return -EBADFD;
+   if (!tun) {
+   ret = -EBADFD;
+   goto out_free_skb;


Unfortunately, you can't to there since tun is NULL.



+   }
  
  	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {

ret = -EINVAL;
-   goto out;
+   goto out_free_skb;
}
if (flags & MSG_ERRQUEUE) {
ret = sock_recv_errqueue(sock->sk, m, total_len,
@@ -2087,6 +2090,11 @@ static int tun_recvmsg(struct socket *sock, struct 
msghdr *m, size_t total_len,
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
}
+   goto out;


We usually don't use goto in the case of success, and you need deal with 
the case skb != NULL but iov_iter_count(to) == 0 in tun_do_read().


Thanks


+
+out_free_skb:
+   if (skb)
+   kfree_skb(skb);
  out:
tun_put(tun);
return ret;




Re: [PATCH net v2 3/3] xfrm: Remove redundant state assignment in xfrm_input()

2017-11-30 Thread Steffen Klassert
On Tue, Nov 28, 2017 at 07:55:42PM +0200, av...@mellanox.com wrote:
> From: Aviv Heller 
> 
> x is already initialized to the same value, above.
> 
> Signed-off-by: Aviv Heller 
> Signed-off-by: Yevgeny Kliteynik  

Applied to ipsec-next, thanks!


Re: [PATCH net v2 1/3] xfrm: Fix xfrm_input() to verify state is valid when (encap_type < 0)

2017-11-30 Thread Steffen Klassert
On Tue, Nov 28, 2017 at 07:55:40PM +0200, av...@mellanox.com wrote:
> From: Aviv Heller 
> 
> Code path when (encap_type < 0) does not verify the state is valid
> before progressing.
> 
> This will result in a crash if, for instance, x->km.state ==
> XFRM_STATE_ACQ.
> 
> Fixes: 7785bba299a8 ("esp: Add a software GRO codepath")
> Signed-off-by: Aviv Heller 
> Signed-off-by: Yevgeny Kliteynik 

Good catch!

Patch applied, thanks Aviv!


Re: [PATCH net-next 3/3] xfrm: Add ESN support for IPSec HW offload

2017-11-30 Thread Steffen Klassert
On Tue, Nov 28, 2017 at 11:49:30AM +0200, yoss...@mellanox.com wrote:
> From: Yossef Efraim 
> 
> This patch adds ESN support to IPsec device offload.
> Adding new xfrm device operation to synchronize device ESN.
> 
> Signed-off-by: Yossef Efraim 
> ---
>  include/linux/netdevice.h |  1 +
>  include/net/xfrm.h| 12 
>  net/xfrm/xfrm_device.c|  4 ++--
>  net/xfrm/xfrm_replay.c|  2 ++
>  4 files changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 7de7656..d4e9198 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -825,6 +825,7 @@ struct xfrmdev_ops {
>   void(*xdo_dev_state_free) (struct xfrm_state *x);
>   bool(*xdo_dev_offload_ok) (struct sk_buff *skb,
>  struct xfrm_state *x);
> + void(*xdo_dev_state_advance_esn) (struct xfrm_state *x);

We now have a documentation for the xfrm offloadin API in the
ipsec-next tree. Please document the new device operation
there and resubmit.

Thanks!


Re: [PATCH net-next 2/3] xfrm: Fix xfrm_dev_state_add to fail for unsupported HW SA option

2017-11-30 Thread Steffen Klassert
On Tue, Nov 28, 2017 at 11:49:29AM +0200, yoss...@mellanox.com wrote:
> From: Yossef Efraim 
> 
> xfrm_dev_state_add function returns success for unsupported HW SA options.
> This results in the calling function creating a SW SA without a
> correlating HW SA, despite the IPSec device offloading option being chosen.
> These unsupported HW SA options are hard coded within the
> xfrm_dev_state_add function.
> SW backward compatibility will break if we add any of these options, as old
> HW will fail with new SW.
> 
> This patch changes the behaviour to return -EINVAL in case unsupported
> option is chosen.
> Notifying user application regarding failure and not breaking backward
> compatibility for newly added HW SA options.
> 
> Signed-off-by: Yossef Efraim 

Also applied to ipsec-next, thanks a lot!



Re: [PATCH net-next 1/3] xfrm: Fix xfrm_replay_overflow_offload_esn

2017-11-30 Thread Steffen Klassert
On Tue, Nov 28, 2017 at 11:49:28AM +0200, yoss...@mellanox.com wrote:
> From: Yossef Efraim 
> 
> In case of wrap around, replay_esn->oseq_hi is not updated
> before it is tested for its actual value, leading the function
> to fail with overflow indication and packets being dropped.
> 
> This patch updates replay_esn->oseq_hi in the right place.
> 
> Fixes: d7dbefc45cf5 ("xfrm: Add xfrm_replay_overflow functions for 
> offloading")
> Signed-off-by: Yossef Efraim 

Applied to ipsec-next, thanks!


Re: [PATCH V11 0/5] hash addresses printed with %p

2017-11-30 Thread Sergey Senozhatsky
On (11/30/17 19:26), Sergey Senozhatsky wrote:
> On (11/30/17 10:23), David Laight wrote:
> [..]
> > > Maybe I'm being thick, but...  if we're rendering these addresses
> > > unusable by hashing them, why not just print something like
> > > "" in their place?  That loses the uniqueness thing but I
> > > wonder how valuable that is in practice?
> > 
> > My worry is that is you get a kernel 'oops' print with actual register
> > values you have no easy way of tying an address or address+offset to
> > the corresponding hash(address) printed elsewhere.
> 
> print the existing hash:pointer mappings in panic()? [if we can do that]

by this I meant
"when oops_in_progress == 1 then print hash:pointer for %p,
 not just hash".

-ss


Re: [PATCH ipsec] xfrm: fix XFRMA_OUTPUT_MARK policy entry

2017-11-30 Thread Steffen Klassert
On Wed, Nov 29, 2017 at 06:23:56PM +0100, Michal Kubecek wrote:
> This seems to be an obvious typo, NLA_U32 is type of the attribute, not its
> (minimal) length.
> 
> Fixes: 077fbac405bf ("net: xfrm: support setting an output mark.")
> Signed-off-by: Michal Kubecek 

Patch applied, thanks Michal!


[PATCH 3/3] tap: free skb if flags error

2017-11-30 Thread wexu
From: Wei Xu 

tap_recvmsg() supports accepting skb by msg_control after
commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
the skb, if present, should be freed within the function; otherwise
it would be leaked.

Signed-off-by: Wei Xu 
Reported-by: Matthew Rosato 
---
 drivers/net/tap.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index e9489b8..1c66b75 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1154,9 +1154,13 @@ static int tap_recvmsg(struct socket *sock, struct 
msghdr *m,
   size_t total_len, int flags)
 {
struct tap_queue *q = container_of(sock, struct tap_queue, sock);
+   struct sk_buff *skb = m->msg_control;
int ret;
-   if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+   if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
+   if (skb)
+   kfree_skb(skb);
return -EINVAL;
+   }
ret = tap_do_read(q, >msg_iter, flags & MSG_DONTWAIT,
  m->msg_control);
if (ret > total_len) {
-- 
1.8.3.1



[PATCH 1/3] vhost: fix skb leak in handle_rx()

2017-11-30 Thread wexu
From: Wei Xu 

Matthew found a roughly 40% tcp throughput regression with commit
c67df11f(vhost_net: try batch dequing from skb array) as discussed
in the following thread:
https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html

Eventually we figured out that it was a skb leak in handle_rx()
when sending packets to the VM. This usually happens when a guest
can not drain out vq as fast as vhost fills in, afterwards it sets
off the traffic jam and leaks skb(s) which occurs as no headcount
to send on the vq from vhost side.

This can be avoided by making sure we have got enough headcount
before actually consuming a skb from the batched rx array while
transmitting, which is simply done by moving checking the zero
headcount a bit ahead.

Signed-off-by: Wei Xu 
Reported-by: Matthew Rosato 
---
 drivers/vhost/net.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8d626d7..c7bdeb6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -778,16 +778,6 @@ static void handle_rx(struct vhost_net *net)
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
-   if (nvq->rx_array)
-   msg.msg_control = vhost_net_buf_consume(>rxq);
-   /* On overrun, truncate and discard */
-   if (unlikely(headcount > UIO_MAXIOV)) {
-   iov_iter_init(_iter, READ, vq->iov, 1, 1);
-   err = sock->ops->recvmsg(sock, ,
-1, MSG_DONTWAIT | MSG_TRUNC);
-   pr_debug("Discarded rx packet: len %zd\n", sock_len);
-   continue;
-   }
/* OK, now we need to know about added descriptors. */
if (!headcount) {
if (unlikely(vhost_enable_notify(>dev, vq))) {
@@ -800,6 +790,16 @@ static void handle_rx(struct vhost_net *net)
 * they refilled. */
goto out;
}
+   if (nvq->rx_array)
+   msg.msg_control = vhost_net_buf_consume(>rxq);
+   /* On overrun, truncate and discard */
+   if (unlikely(headcount > UIO_MAXIOV)) {
+   iov_iter_init(_iter, READ, vq->iov, 1, 1);
+   err = sock->ops->recvmsg(sock, ,
+1, MSG_DONTWAIT | MSG_TRUNC);
+   pr_debug("Discarded rx packet: len %zd\n", sock_len);
+   continue;
+   }
/* We don't need to be notified again. */
iov_iter_init(_iter, READ, vq->iov, in, vhost_len);
fixup = msg.msg_iter;
-- 
1.8.3.1



[PATCH net,stable v3] vhost: fix a few skb leaks

2017-11-30 Thread wexu
From: Wei Xu 

Matthew found a roughly 40% tcp throughput regression with commit
c67df11f(vhost_net: try batch dequing from skb array) as discussed
in the following thread:
https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html

This is v3.

v3:
- move freeing skb from vhost to tun/tap recvmsg() to not
  confuse the callers.

v2:
- add Matthew as the reporter, thanks matthew.
- moving zero headcount check ahead instead of defer consuming skb
  due to jason and mst's comment.
- add freeing skb in favor of recvmsg() fails.

Wei Xu (3):
  vhost: fix skb leak in handle_rx()
  tun: free skb in early errors
  tap: free skb if flags error

 drivers/net/tap.c   |  6 +-
 drivers/net/tun.c   | 14 +++---
 drivers/vhost/net.c | 20 ++--
 3 files changed, 26 insertions(+), 14 deletions(-)

-- 
1.8.3.1



[PATCH 2/3] tun: free skb in early errors

2017-11-30 Thread wexu
From: Wei Xu 

tun_recvmsg() supports accepting skb by msg_control after
commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
the skb, if present, should be freed within the function; otherwise it
would be leaked.

Signed-off-by: Wei Xu 
Reported-by: Matthew Rosato 
---
 drivers/net/tun.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6a7bde9..5563430 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2067,14 +2067,17 @@ static int tun_recvmsg(struct socket *sock, struct 
msghdr *m, size_t total_len,
 {
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+   struct sk_buff *skb = m->msg_control;
int ret;
 
-   if (!tun)
-   return -EBADFD;
+   if (!tun) {
+   ret = -EBADFD;
+   goto out_free_skb;
+   }
 
if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
ret = -EINVAL;
-   goto out;
+   goto out_free_skb;
}
if (flags & MSG_ERRQUEUE) {
ret = sock_recv_errqueue(sock->sk, m, total_len,
@@ -2087,6 +2090,11 @@ static int tun_recvmsg(struct socket *sock, struct 
msghdr *m, size_t total_len,
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
}
+   goto out;
+
+out_free_skb:
+   if (skb)
+   kfree_skb(skb);
 out:
tun_put(tun);
return ret;
-- 
1.8.3.1



Re: [PATCH ipsec] xfrm: add documentation for xfrm device offload api

2017-11-30 Thread Steffen Klassert
On Mon, Nov 20, 2017 at 02:26:07PM -0800, Shannon Nelson wrote:
> Add a writeup on how to use the XFRM device offload API, and
> mention this new file in the index.
> 
> Signed-off-by: Shannon Nelson 

Applied to ipsec-next, thanks a lot for this documentation!


Re: [PATCH ipsec-next] net: xfrm: allow clearing socket xfrm policies.

2017-11-30 Thread Steffen Klassert
On Mon, Nov 20, 2017 at 07:26:02PM +0900, Lorenzo Colitti wrote:
> Currently it is possible to add or update socket policies, but
> not clear them. Therefore, once a socket policy has been applied,
> the socket cannot be used for unencrypted traffic.
> 
> This patch allows (privileged) users to clear socket policies by
> passing in a NULL pointer and zero length argument to the
> {IP,IPV6}_{IPSEC,XFRM}_POLICY setsockopts. This results in both
> the incoming and outgoing policies being cleared.
> 
> The simple approach taken in this patch cannot clear socket
> policies in only one direction. If desired this could be added
> in the future, for example by continuing to pass in a length of
> zero (which currently is guaranteed to return EMSGSIZE) and
> making the policy be a pointer to an integer that contains one
> of the XFRM_POLICY_{IN,OUT} enum values.
> 
> An alternative would have been to interpret the length as a
> signed integer and use XFRM_POLICY_IN (i.e., 0) to clear the
> input policy and -XFRM_POLICY_OUT (i.e., -1) to clear the output
> policy.
> 
> Tested: https://android-review.googlesource.com/539816
> Signed-off-by: Lorenzo Colitti 

Applied to ipsec-next, thanks Lorenzo!


[PATCH net-next 08/13] nfp: bpf: correct the encoding for No-Dest immed

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

When immed is used with No-Dest, the emitter should use reg.dst instead of
reg.areg for the destination, using the latter will actually encode
register zero.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 54915a3b8a7e..024b44089623 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -224,9 +224,11 @@ emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
return;
}
 
-   __emit_immed(nfp_prog, reg.areg, reg.breg, imm >> 8, width,
-invert, shift, reg.wr_both,
-reg.dst_lmextn, reg.src_lmextn);
+   /* Use reg.dst when destination is No-Dest. */
+   __emit_immed(nfp_prog,
+swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
+reg.breg, imm >> 8, width, invert, shift,
+reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
-- 
2.15.0



[PATCH net-next 11/13] nfp: bpf: implement memory bulk copy for length within 32-bytes

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

For NFP, we want to re-group a sequence of load/store pairs lowered from
memcpy/memmove into single memory bulk operation which then could be
accelerated using NFP CPP bus.

This patch extends the existing load/store auxiliary information by adding
two new fields:

struct bpf_insn *paired_st;
s16 ldst_gather_len;

Both fields are supposed to be carried by the the load instruction at the
head of the sequence. "paired_st" is the corresponding store instruction at
the head and "ldst_gather_len" is the gathered length.

If "ldst_gather_len" is negative, then the sequence is doing memory
load/store in descending order, otherwise it is in ascending order. We need
this information to detect overlapped memory access.

This patch then optimize memory bulk copy when the copy length is within
32-bytes.

The strategy of read/write used is:

  * Read.
Use read32 (direct_ref), always.

  * Write.
- length <= 8-bytes
  write8 (direct_ref).
- length <= 32-bytes and is 4-byte aligned
  write32 (direct_ref).
- length <= 32-bytes but is not 4-byte aligned
  write8 (indirect_ref).

NOTE: the optimization should not change program semantics. The destination
register of the last load instruction should contain the same value before
and after this optimization.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 113 ++
 drivers/net/ethernet/netronome/nfp/bpf/main.h |   4 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.c  |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |   4 +
 4 files changed, 122 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index da4e106d3b16..138568c0eee6 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, 
u8 mode, u8 xfer,
emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
 }
 
+static void
+emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 
xfer,
+  swreg lreg, swreg rreg, u8 size, bool sync)
+{
+   emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true);
+}
+
 static void
 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
  enum br_ctx_signal_state css, u16 addr, u8 defer)
@@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 
dst, u16 src)
wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
 }
 
+/* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
+ * result to @dst from low end.
+ */
+static void
+wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
+   u8 offset)
+{
+   enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
+   u8 mask = (1 << field_len) - 1;
+
+   emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
+}
+
+/* NFP has Command Push Pull bus which supports bluk memory operations. */
+static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta 
*meta)
+{
+   bool descending_seq = meta->ldst_gather_len < 0;
+   s16 len = abs(meta->ldst_gather_len);
+   swreg src_base, off;
+   unsigned int i;
+   u8 xfer_num;
+
+   if (WARN_ON_ONCE(len > 32))
+   return -EOPNOTSUPP;
+
+   off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
+   src_base = reg_a(meta->insn.src_reg * 2);
+   xfer_num = round_up(len, 4) / 4;
+
+   /* Memory read from source addr into transfer-in registers. */
+   emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
+xfer_num - 1, true);
+
+   /* Move from transfer-in to transfer-out. */
+   for (i = 0; i < xfer_num; i++)
+   wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
+
+   off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
+
+   if (len <= 8) {
+   /* Use single direct_ref write8. */
+   emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
+reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
+true);
+   } else if (IS_ALIGNED(len, 4)) {
+   /* Use single direct_ref write32. */
+   emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
+true);
+   } else {
+   /* Use single indirect_ref write8. */
+   wrp_immed(nfp_prog, reg_none(),
+ CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
+   emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
+  

[PATCH net-next 06/13] nfp: bpf: don't do ld/shifts combination if shifts are jump destination

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

If any of the shift insns in the ld/shift sequence is jump destination,
don't do combination.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index f2317b764222..54915a3b8a7e 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2181,6 +2181,10 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog 
*nfp_prog)
if (next1.imm != 0x20 || next2.imm != 0x20)
continue;
 
+   if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
+   meta3->flags & FLAG_INSN_IS_JUMP_DST)
+   continue;
+
meta2->skip = true;
meta3->skip = true;
}
-- 
2.15.0



[PATCH net-next 12/13] nfp: bpf: implement memory bulk copy for length bigger than 32-bytes

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

When the gathered copy length is bigger than 32-bytes and within 128-bytes
(the maximum length a single CPP Pull/Push request can finish), the
strategy of read/write is changed to:

  * Read.
  - use direct reference mode when length is within 32-bytes.
  - use indirect mode when length is bigger than 32-bytes.

  * Write.
  - length <= 8-bytes
use write8 (direct_ref).
  - length <= 32-byte and 4-bytes aligned
use write32 (direct_ref).
  - length <= 32-bytes but not 4-bytes aligned
use write8 (indirect_ref).
  - length > 32-bytes and 4-bytes aligned
use write32 (indirect_ref).
  - length > 32-bytes and not 4-bytes aligned and <= 40-bytes
use write32 (direct_ref) to finish the first 32-bytes.
use write8 (direct_ref) to finish all remaining hanging part.
  - length > 32-bytes and not 4-bytes aligned
use write32 (indirect_ref) to finish those 4-byte aligned parts.
use write8 (direct_ref) to finish all remaining hanging part.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 52 
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 138568c0eee6..1b98ef239605 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -544,16 +544,18 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, 
struct nfp_insn_meta *meta)
unsigned int i;
u8 xfer_num;
 
-   if (WARN_ON_ONCE(len > 32))
-   return -EOPNOTSUPP;
-
off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
src_base = reg_a(meta->insn.src_reg * 2);
xfer_num = round_up(len, 4) / 4;
 
+   /* Setup PREV_ALU fields to override memory read length. */
+   if (len > 32)
+   wrp_immed(nfp_prog, reg_none(),
+ CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
+
/* Memory read from source addr into transfer-in registers. */
-   emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
-xfer_num - 1, true);
+   emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
+off, xfer_num - 1, true, len > 32);
 
/* Move from transfer-in to transfer-out. */
for (i = 0; i < xfer_num; i++)
@@ -566,18 +568,54 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, 
struct nfp_insn_meta *meta)
emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
 true);
-   } else if (IS_ALIGNED(len, 4)) {
+   } else if (len <= 32 && IS_ALIGNED(len, 4)) {
/* Use single direct_ref write32. */
emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
 true);
-   } else {
+   } else if (len <= 32) {
/* Use single indirect_ref write8. */
wrp_immed(nfp_prog, reg_none(),
  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
   reg_a(meta->paired_st->dst_reg * 2), off,
   len - 1, true);
+   } else if (IS_ALIGNED(len, 4)) {
+   /* Use single indirect_ref write32. */
+   wrp_immed(nfp_prog, reg_none(),
+ CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
+   emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+  reg_a(meta->paired_st->dst_reg * 2), off,
+  xfer_num - 1, true);
+   } else if (len <= 40) {
+   /* Use one direct_ref write32 to write the first 32-bytes, then
+* another direct_ref write8 to write the remaining bytes.
+*/
+   emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+reg_a(meta->paired_st->dst_reg * 2), off, 7,
+true);
+
+   off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
+ imm_b(nfp_prog));
+   emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
+reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
+true);
+   } else {
+   /* Use one indirect_ref write32 to write 4-bytes aligned length,
+* then another direct_ref write8 to write the remaining bytes.
+*/
+   u8 

[PATCH net-next 09/13] nfp: bpf: encode indirect commands

2017-11-30 Thread Jakub Kicinski
Add support for emitting commands with field overwrites.

Signed-off-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 17 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h |  3 ++-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 024b44089623..da4e106d3b16 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -96,7 +96,7 @@ nfp_prog_offset_to_index(struct nfp_prog *nfp_prog, unsigned 
int offset)
 /* --- Emitters --- */
 static void
 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-  u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync)
+  u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync, bool indir)
 {
enum cmd_ctx_swap ctx;
u64 insn;
@@ -114,14 +114,15 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
FIELD_PREP(OP_CMD_CNT, size) |
FIELD_PREP(OP_CMD_SIG, sync) |
FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
+   FIELD_PREP(OP_CMD_INDIR, indir) |
FIELD_PREP(OP_CMD_MODE, mode);
 
nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-u8 mode, u8 xfer, swreg lreg, swreg rreg, u8 size, bool sync)
+emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
+swreg lreg, swreg rreg, u8 size, bool sync, bool indir)
 {
struct nfp_insn_re_regs reg;
int err;
@@ -142,7 +143,15 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
return;
}
 
-   __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync);
+   __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync,
+  indir);
+}
+
+static void
+emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
+swreg lreg, swreg rreg, u8 size, bool sync)
+{
+   emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
 }
 
 static void
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h 
b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index 74d0c11ab2f9..6ff842a15e5d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -209,6 +209,7 @@ enum alu_dst_ab {
 #define OP_CMD_CNT 0xe00ULL
 #define OP_CMD_SIG 0x000f000ULL
 #define OP_CMD_TGT_CMD 0x07fULL
+#define OP_CMD_INDIR   0x200ULL
 #define OP_CMD_MODE   0x1c00ULL
 
 struct cmd_tgt_act {
-- 
2.15.0



[PATCH net-next 03/13] nfp: bpf: record jump destination to simplify jump fixup

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

eBPF insns are internally organized as dual-list inside NFP offload JIT.
Random access to an insn needs to be done by either forward or backward
traversal along the list.

One place we need to do such traversal is at nfp_fixup_branches where one
traversal is needed for each jump insn to find the destination. Such
traversals could be avoided if jump destinations are collected through a
single travesal in a pre-scan pass, and such information could also be
useful in other places where jump destination info are needed.

This patch adds such jump destination collection in nfp_prog_prepare.

Suggested-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 57 ---
 drivers/net/ethernet/netronome/nfp/bpf/main.h | 13 +-
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  | 22 +++--
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |  4 +-
 4 files changed, 41 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 20daf6b95601..f76659ecb654 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -65,12 +65,6 @@
 next = nfp_meta_next(pos), \
 next2 = nfp_meta_next(next))
 
-static bool
-nfp_meta_has_next(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-   return meta->l.next != _prog->insns;
-}
-
 static bool
 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
@@ -1864,9 +1858,8 @@ static void br_set_offset(u64 *instr, u16 offset)
 /* --- Assembler logic --- */
 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 {
-   struct nfp_insn_meta *meta, *next;
+   struct nfp_insn_meta *meta, *jmp_dst;
u32 idx, br_idx;
-   int off;
 
list_for_each_entry(meta, _prog->insns, l) {
if (meta->skip)
@@ -1874,13 +1867,10 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
if (BPF_CLASS(meta->insn.code) != BPF_JMP)
continue;
 
-   if (list_is_last(>l, _prog->insns)) {
-   next = NULL;
+   if (list_is_last(>l, _prog->insns))
idx = nfp_prog->last_bpf_off;
-   } else {
-   next = list_next_entry(meta, l);
-   idx = next->off - 1;
-   }
+   else
+   idx = list_next_entry(meta, l)->off - 1;
 
br_idx = nfp_prog_offset_to_index(nfp_prog, idx);
 
@@ -1893,43 +1883,14 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
if (FIELD_GET(OP_BR_SPECIAL, nfp_prog->prog[br_idx]))
continue;
 
-   /* Find the target offset in assembler realm */
-   off = meta->insn.off;
-   if (!off) {
-   pr_err("Fixup found zero offset!!\n");
+   if (!meta->jmp_dst) {
+   pr_err("Non-exit jump doesn't have destination info 
recorded!!\n");
return -ELOOP;
}
 
-   if (!next) {
-   /* When "next" is NULL, "meta" is the last node in the
-* list. Given it is an JMP, it then must be a backward
-* jump.
-*
-* For eBPF, the jump offset is against pc + 1, so we
-* need to compensate the offset by 1 as we are pointing
-* "next" to the current node "meta".
-*/
-   if (WARN_ON_ONCE(off > -2))
-   return -ELOOP;
-
-   next = meta;
-   off += 1;
-   }
-
-   while (off > 0 && nfp_meta_has_next(nfp_prog, next)) {
-   next = nfp_meta_next(next);
-   off--;
-   }
-   while (off < 0 && nfp_meta_has_prev(nfp_prog, next)) {
-   next = nfp_meta_prev(next);
-   off++;
-   }
-   if (off) {
-   pr_err("Fixup found too large jump!! %d\n", off);
-   return -ELOOP;
-   }
+   jmp_dst = meta->jmp_dst;
 
-   if (next->skip) {
+   if (jmp_dst->skip) {
pr_err("Branch landing on removed instruction!!\n");
return -ELOOP;
}
@@ -1938,7 +1899,7 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 idx <= br_idx; idx++) {
if (!nfp_is_br(nfp_prog->prog[idx]))
  

[PATCH net-next 07/13] nfp: bpf: relax source operands check

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

The NFP normally requires the source operands to be different addressing
modes, but we should rule out the very special NN_REG_NONE type.

There are instruction that ignores both A/B operands, for example:

  local_csr_rd

For these instructions, we might pass the same operand type, NN_REG_NONE,
for both A/B operands.

NOTE: in current NFP ISA, it is only possible for instructions with
unrestricted operands to take none operands, but in case there is a new and
similar instruction in restricted form, it would follow similar rules,
so swreg_to_restricted is updated as well.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_asm.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c 
b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index 830f6de25f47..da277386077c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -120,7 +120,8 @@ int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
reg->dst = nfp_swreg_to_unreg(dst, true);
 
/* Decode source operands */
-   if (swreg_type(lreg) == swreg_type(rreg))
+   if (swreg_type(lreg) == swreg_type(rreg) &&
+   swreg_type(lreg) != NN_REG_NONE)
return -EFAULT;
 
if (swreg_type(lreg) == NN_REG_GPR_B ||
@@ -200,7 +201,8 @@ int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);
 
/* Decode source operands */
-   if (swreg_type(lreg) == swreg_type(rreg))
+   if (swreg_type(lreg) == swreg_type(rreg) &&
+   swreg_type(lreg) != NN_REG_NONE)
return -EFAULT;
 
if (swreg_type(lreg) == NN_REG_GPR_B ||
-- 
2.15.0



[PATCH net-next 04/13] nfp: bpf: flag jump destination to guide insn combine optimizations

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

NFP eBPF offload JIT engine is doing some instruction combine based
optimizations which, however, may not be safe if the combined sequences
are across basic block borders.

Currently, there are post checks during fixing jump destinations. If the
jump destination is found to be eBPF insn that has been combined into
another one, then JIT engine will raise error and abort.

This is not optimal. The JIT engine ought to disable the optimization on
such cross-bb-border sequences instead of abort.

As there is no control flow information in the eBPF infrastructure, we
can't do basic block based optimizations, so this patch extends the existing
jump destination record pass to also flag the jump destination, then in
instruction combine passes we could skip the optimizations if insns in the
sequence are jump targets.

Suggested-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h| 4 
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h 
b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index e488656f406c..99da1d34dd0e 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -89,6 +89,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct 
nfp_insn_meta *);
 #define nfp_meta_next(meta)list_next_entry(meta, l)
 #define nfp_meta_prev(meta)list_prev_entry(meta, l)
 
+#define FLAG_INSN_IS_JUMP_DST  BIT(0)
+
 /**
  * struct nfp_insn_meta - BPF instruction wrapper
  * @insn: BPF instruction
@@ -97,6 +99,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct 
nfp_insn_meta *);
  * @jmp_dst: destination info for jump instructions
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
+ * @flags: eBPF instruction extra optimization flags
  * @skip: skip this instruction (optimized out)
  * @double_cb: callback for second part of the instruction
  * @l: link on nfp_prog->insns list
@@ -112,6 +115,7 @@ struct nfp_insn_meta {
};
unsigned int off;
unsigned short n;
+   unsigned short flags;
bool skip;
instr_cb_t double_cb;
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c 
b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 240db663d83f..377976ce92dd 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -83,6 +83,7 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct 
bpf_insn *prog,
 cnt);
 
meta->jmp_dst = dst_meta;
+   dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
}
}
 
-- 
2.15.0



[PATCH net-next 01/13] nfp: fix old kdoc issues

2017-11-30 Thread Jakub Kicinski
Since commit 3a025e1d1c2e ("Add optional check for bad kernel-doc
comments") when built with W=1 build will complain about kdoc errors.
Fix the kdoc issues we have.  kdoc is still confused by defines in
nfp_net_ctrl.h but those are not really errors.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h | 2 ++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c | 9 +++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 7f9857c276b1..3801c52098d5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -548,6 +548,8 @@ struct nfp_net_dp {
  * @max_r_vecs:Number of allocated interrupt vectors for RX/TX
  * @max_tx_rings:   Maximum number of TX rings supported by the Firmware
  * @max_rx_rings:   Maximum number of RX rings supported by the Firmware
+ * @stride_rx: Queue controller RX queue spacing
+ * @stride_tx: Queue controller TX queue spacing
  * @r_vecs: Pre-allocated array of ring vectors
  * @irq_entries:Pre-allocated array of MSI-X entries
  * @lsc_handler:Handler for Link State Change interrupt
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c 
b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
index 04dd5758ecf5..3fcb522d2e85 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
@@ -372,8 +372,7 @@ nfp_cpp_area_alloc(struct nfp_cpp *cpp, u32 dest,
  * that it can be accessed directly.
  *
  * NOTE: @address and @size must be 32-bit aligned values.
- *
- * NOTE: The area must also be 'released' when the structure is freed.
+ * The area must also be 'released' when the structure is freed.
  *
  * Return: NFP CPP Area handle, or NULL
  */
@@ -536,8 +535,7 @@ void nfp_cpp_area_release_free(struct nfp_cpp_area *area)
  * Read data from indicated CPP region.
  *
  * NOTE: @offset and @length must be 32-bit aligned values.
- *
- * NOTE: Area must have been locked down with an 'acquire'.
+ * Area must have been locked down with an 'acquire'.
  *
  * Return: length of io, or -ERRNO
  */
@@ -558,8 +556,7 @@ int nfp_cpp_area_read(struct nfp_cpp_area *area,
  * Write data to indicated CPP region.
  *
  * NOTE: @offset and @length must be 32-bit aligned values.
- *
- * NOTE: Area must have been locked down with an 'acquire'.
+ * Area must have been locked down with an 'acquire'.
  *
  * Return: length of io, or -ERRNO
  */
-- 
2.15.0



[PATCH net-next 10/13] nfp: bpf: factor out is_mbpf_load & is_mbpf_store

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

It is usual that we need to check if one BPF insn is for loading/storing
data from/to memory.

Therefore, it makes sense to factor out related code to become common
helper functions.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h | 10 ++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h 
b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 99da1d34dd0e..20ef0adb2931 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -144,6 +144,16 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta 
*meta)
return BPF_MODE(meta->insn.code);
 }
 
+static inline bool is_mbpf_load(const struct nfp_insn_meta *meta)
+{
+   return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM);
+}
+
+static inline bool is_mbpf_store(const struct nfp_insn_meta *meta)
+{
+   return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM);
+}
+
 /**
  * struct nfp_prog - nfp BPF program
  * @prog: machine code
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c 
b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index cca67730b91f..d2bf29c90226 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -180,10 +180,10 @@ nfp_verify_insn(struct bpf_verifier_env *env, int 
insn_idx, int prev_insn_idx)
if (meta->insn.code == (BPF_JMP | BPF_EXIT))
return nfp_bpf_check_exit(nfp_prog, env);
 
-   if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM))
+   if (is_mbpf_load(meta))
return nfp_bpf_check_ptr(nfp_prog, meta, env,
 meta->insn.src_reg);
-   if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM))
+   if (is_mbpf_store(meta))
return nfp_bpf_check_ptr(nfp_prog, meta, env,
 meta->insn.dst_reg);
 
-- 
2.15.0



[PATCH net-next 05/13] nfp: bpf: don't do ld/mask combination if mask is jump destination

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

If the mask insn in the ld/mask pair is jump destination, then don't do
combination.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index f76659ecb654..f2317b764222 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2142,6 +2142,9 @@ static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
if (next.src_reg || next.dst_reg)
continue;
 
+   if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
+   continue;
+
meta2->skip = true;
}
 }
-- 
2.15.0



[PATCH net-next 13/13] nfp: bpf: detect load/store sequences lowered from memory copy

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

This patch adds the optimization frontend, by adding a new eBPF IR scan
pass "nfp_bpf_opt_ldst_gather".

The pass will traverse the IR to recognize the load/store pair sequences
that come from the lowering of memory copy builtins.

The gathered memory copy information will be kept in the meta info
structure of the first load instruction in the sequence and will be
consumed by the optimization backend added in the previous patches.

NOTE: a sequence with cross memory access doesn't qualify for this
optimization, i.e. if one load in the sequence loads from a place that
has been written by a previous store. This is because when we turn the
sequence into single CPP operation, we are reading all contents at once
into NFP transfer registers, then write them out as a whole. This is not
identical with what the original load/store sequence is doing.

Detecting cross memory access for two arbitrary pointers would be difficult;
fortunately, under XDP/eBPF's restricted runtime environment, the copy
normally happens among the map, packet data and stack, which do not overlap
with each other.

And for the cases supported by NFP, cross memory access will only happen on
PTR_TO_PACKET. Fortunately for this, there is ID information with which we
can do an accurate memory alias check.

Signed-off-by: Jiong Wang 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 237 +++
 1 file changed, 237 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 1b98ef239605..3419ad495962 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2352,12 +2352,249 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog 
*nfp_prog)
}
 }
 
+/* load/store pair that forms memory copy sould look like the following:
+ *
+ *   ld_width R, [addr_src + offset_src]
+ *   st_width [addr_dest + offset_dest], R
+ *
+ * The destination register of load and source register of store should
+ * be the same, load and store should also perform at the same width.
+ * If either of addr_src or addr_dest is stack pointer, we don't do the
+ * CPP optimization as stack is modelled by registers on NFP.
+ */
+static bool
+curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
+   struct nfp_insn_meta *st_meta)
+{
+   struct bpf_insn *ld = _meta->insn;
+   struct bpf_insn *st = _meta->insn;
+
+   if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
+   return false;
+
+   if (ld_meta->ptr.type != PTR_TO_PACKET)
+   return false;
+
+   if (st_meta->ptr.type != PTR_TO_PACKET)
+   return false;
+
+   if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
+   return false;
+
+   if (ld->dst_reg != st->src_reg)
+   return false;
+
+   /* There is jump to the store insn in this pair. */
+   if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
+   return false;
+
+   return true;
+}
+
+/* Currently, we only support chaining load/store pairs if:
+ *
+ *  - Their address base registers are the same.
+ *  - Their address offsets are in the same order.
+ *  - They operate at the same memory width.
+ *  - There is no jump into the middle of them.
+ */
+static bool
+curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
+ struct nfp_insn_meta *st_meta,
+ struct bpf_insn *prev_ld,
+ struct bpf_insn *prev_st)
+{
+   u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
+   struct bpf_insn *ld = _meta->insn;
+   struct bpf_insn *st = _meta->insn;
+   s16 prev_ld_off, prev_st_off;
+
+   /* This pair is the start pair. */
+   if (!prev_ld)
+   return true;
+
+   prev_size = BPF_LDST_BYTES(prev_ld);
+   curr_size = BPF_LDST_BYTES(ld);
+   prev_ld_base = prev_ld->src_reg;
+   prev_st_base = prev_st->dst_reg;
+   prev_ld_dst = prev_ld->dst_reg;
+   prev_ld_off = prev_ld->off;
+   prev_st_off = prev_st->off;
+
+   if (ld->dst_reg != prev_ld_dst)
+   return false;
+
+   if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
+   return false;
+
+   if (curr_size != prev_size)
+   return false;
+
+   /* There is jump to the head of this pair. */
+   if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
+   return false;
+
+   /* Both in ascending order. */
+   if (prev_ld_off + prev_size == ld->off &&
+   prev_st_off + prev_size == st->off)
+   return true;
+
+   /* Both in descending order. */
+   if (ld->off + curr_size == prev_ld_off &&
+   st->off + curr_size == prev_st_off)
+   return true;
+
+   return false;
+}
+
+/* Return TRUE if cross 

[PATCH net-next 00/13] nfp: bpf: jump resolution and memcpy update

2017-11-30 Thread Jakub Kicinski
Hi!

Jiong says:

Currently, the compiler will lower a memcpy function call in an XDP/eBPF C
program into a sequence of eBPF load/store pairs in some scenarios.

The compiler considers this "inline" optimization beneficial as it avoids
the function call overhead and also increases code locality.

However, the Netronome NPU is not a traditional load/store architecture, so
executing a sequence of individual load/store actions is not efficient.

This patch set tries to identify the load/store sequences composed of
load/store pairs that come from memcpy lowering, then accelerates them
through the NPU's Command Push Pull (CPP) instruction.

This patch set registers a new optimization pass before doing the actual
JIT work. It traverses the eBPF IR; once a candidate sequence is found, it
records the memory copy source, destination and length information in the
first load instruction starting the sequence and marks all remaining
instructions in the sequence as skippable. Later, when JITing the first
load instruction, optimal instructions will be generated using the recorded
information.

For the safety of this transformation:

  - jump into the middle of the sequence will cancel the optimization.

  - overlapped memory access will cancel the optimization.

  - the load destination register still contains the same value as before
the transformation.


Jakub Kicinski (2):
  nfp: fix old kdoc issues
  nfp: bpf: encode indirect commands

Jiong Wang (11):
  nfp: bpf: support backward jump
  nfp: bpf: record jump destination to simplify jump fixup
  nfp: bpf: flag jump destination to guide insn combine optimizations
  nfp: bpf: don't do ld/mask combination if mask is jump destination
  nfp: bpf: don't do ld/shifts combination if shifts are jump
destination
  nfp: bpf: relax source operands check
  nfp: bpf: correct the encoding for No-Dest immed
  nfp: bpf: factor out is_mbpf_load & is_mbpf_store
  nfp: bpf: implement memory bulk copy for length within 32-bytes
  nfp: bpf: implement memory bulk copy for length bigger than 32-bytes
  nfp: bpf: detect load/store sequences lowered from memory copy

 drivers/net/ethernet/netronome/nfp/bpf/jit.c   | 489 ++---
 drivers/net/ethernet/netronome/nfp/bpf/main.h  |  35 +-
 drivers/net/ethernet/netronome/nfp/bpf/offload.c   |  23 +-
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c  |   8 +-
 drivers/net/ethernet/netronome/nfp/nfp_asm.c   |   7 +-
 drivers/net/ethernet/netronome/nfp/nfp_asm.h   |   7 +-
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |   2 +
 .../ethernet/netronome/nfp/nfpcore/nfp_cppcore.c   |   9 +-
 8 files changed, 505 insertions(+), 75 deletions(-)

-- 
2.15.0



[PATCH net-next 02/13] nfp: bpf: support backward jump

2017-11-30 Thread Jakub Kicinski
From: Jiong Wang 

This patch adds support for backward jump on NFP.

  - restrictions on backward jump in various functions have been removed.
  - nfp_fixup_branches now supports backward jump.

There is one thing to note, currently an input eBPF JMP insn may generate
several NFP insns, for example,

  NFP imm move insn A \
  NFP compare insn  B  --> 3 NFP insn jited from eBPF JMP insn M
  NFP branch insn   C /
  ---
  NFP insn X   --> 1 NFP insn jited from eBPF insn N
  ---
  ...

therefore, we are doing a sanity check to make sure the last jited insn from
an eBPF JMP is an NFP branch instruction.

Once backward jump is allowed, it is possible an eBPF JMP insn is at the
end of the program. This is however causing trouble for the sanity check.
Because the sanity check requires the end index of the NFP insns jited from
one eBPF insn while only the start index is recorded before this patch that
we can only get the end index by:

  start_index_of_the_next_eBPF_insn - 1

or for the above example:

  start_index_of_eBPF_insn_N (which is the index of NFP insn X) - 1

nfp_fixup_branches was using nfp_for_each_insn_walk2 to expose *next* insn
to each iteration during the traversal so the last index could be
calculated from which. Now, it needs some extra code to handle the last
insn. Meanwhile, the use of walk2 is actually unnecessary, we could simply
use generic single instruction walk to do this, the next insn could be
easily calculated using list_next_entry.

So, this patch migrates the jump fixup traversal method to
*list_for_each_entry*, this simplifies the code logic a little bit.

The other thing to note is that a new state variable "last_bpf_off" is
introduced to track the index of the last jited NFP insn. This is necessary
because NFP generates special-purpose epilogue sequences, so the index
of the last jited NFP insn is *not* always nfp_prog->prog_len - 1.

Suggested-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 66 +++
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  4 +-
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c 
b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 995e95410b11..20daf6b95601 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -975,9 +975,6 @@ wrp_test_reg(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta,
 {
const struct bpf_insn *insn = >insn;
 
-   if (insn->off < 0) /* TODO */
-   return -EOPNOTSUPP;
-
wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
 insn->src_reg * 2, br_mask, insn->off);
wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
@@ -995,9 +992,6 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta 
*meta,
u8 reg = insn->dst_reg * 2;
swreg tmp_reg;
 
-   if (insn->off < 0) /* TODO */
-   return -EOPNOTSUPP;
-
tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
if (!swap)
emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
@@ -1027,9 +1021,6 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta,
areg = insn->dst_reg * 2;
breg = insn->src_reg * 2;
 
-   if (insn->off < 0) /* TODO */
-   return -EOPNOTSUPP;
-
if (swap) {
areg ^= breg;
breg ^= areg;
@@ -1630,8 +1621,6 @@ static int mem_stx8(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
 
 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-   if (meta->insn.off < 0) /* TODO */
-   return -EOPNOTSUPP;
emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
 
return 0;
@@ -1646,9 +1635,6 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
or1 = reg_a(insn->dst_reg * 2);
or2 = reg_b(insn->dst_reg * 2 + 1);
 
-   if (insn->off < 0) /* TODO */
-   return -EOPNOTSUPP;
-
if (imm & ~0U) {
tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
emit_alu(nfp_prog, imm_a(nfp_prog),
@@ -1695,9 +1681,6 @@ static int jset_imm(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
u64 imm = insn->imm; /* sign extend */
swreg tmp_reg;
 
-   if (insn->off < 0) /* TODO */
-   return -EOPNOTSUPP;
-
if (!imm) {
meta->skip = true;
   

[PATCH net-next 3/7] bpf: don't mark FP reg as uninit

2017-11-30 Thread Alexei Starovoitov
When the verifier hits an internal bug, don't mark register R10==FP as uninit,
since it's a read-only register, and it's not technically correct to let the
verifier run further, since it may assume that R10 has valid auxiliary state.

While developing subsequent patches this issue was discovered. Though the
code eventually changed so that the aux reg state no longer holds pointers,
it is still safer to avoid clearing a read-only register.

Signed-off-by: Alexei Starovoitov 
Acked-by: John Fastabend 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/verifier.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f7229390c279..14ad7c6e806a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -584,8 +584,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
 {
if (WARN_ON(regno >= MAX_BPF_REG)) {
verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
-   /* Something bad happened, let's kill all regs */
-   for (regno = 0; regno < MAX_BPF_REG; regno++)
+   /* Something bad happened, let's kill all regs except FP */
+   for (regno = 0; regno < BPF_REG_FP; regno++)
__mark_reg_not_init(regs + regno);
return;
}
@@ -603,8 +603,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 {
if (WARN_ON(regno >= MAX_BPF_REG)) {
verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
-   /* Something bad happened, let's kill all regs */
-   for (regno = 0; regno < MAX_BPF_REG; regno++)
+   /* Something bad happened, let's kill all regs except FP */
+   for (regno = 0; regno < BPF_REG_FP; regno++)
__mark_reg_not_init(regs + regno);
return;
}
-- 
2.9.5



[PATCH net-next 6/7] bpf: cleanup register_is_null()

2017-11-30 Thread Alexei Starovoitov
don't pass large struct bpf_reg_state by value.
Instead pass it by pointer.

Signed-off-by: Alexei Starovoitov 
Acked-by: John Fastabend 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/verifier.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index afe9a1a0a5fe..7afa92e9b409 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1265,9 +1265,9 @@ static int check_xadd(struct bpf_verifier_env *env, int 
insn_idx, struct bpf_ins
 }
 
 /* Does this register contain a constant zero? */
-static bool register_is_null(struct bpf_reg_state reg)
+static bool register_is_null(struct bpf_reg_state *reg)
 {
-   return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
+   return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
 }
 
 /* when register 'regno' is passed into function that will read 'access_size'
@@ -1280,31 +1280,31 @@ static int check_stack_boundary(struct bpf_verifier_env 
*env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
 {
+   struct bpf_reg_state *reg = cur_regs(env) + regno;
struct bpf_verifier_state *state = env->cur_state;
-   struct bpf_reg_state *regs = state->regs;
int off, i, slot, spi;
 
-   if (regs[regno].type != PTR_TO_STACK) {
+   if (reg->type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
-   register_is_null(regs[regno]))
+   register_is_null(reg))
return 0;
 
verbose(env, "R%d type=%s expected=%s\n", regno,
-   reg_type_str[regs[regno].type],
+   reg_type_str[reg->type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
}
 
/* Only allow fixed-offset stack reads */
-   if (!tnum_is_const(regs[regno].var_off)) {
+   if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];
 
-   tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+   tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
}
-   off = regs[regno].off + regs[regno].var_off.value;
+   off = reg->off + reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
@@ -1412,7 +1412,7 @@ static int check_func_arg(struct bpf_verifier_env *env, 
u32 regno,
 * passed in as argument, it's a SCALAR_VALUE type. Final test
 * happens during stack boundary checking.
 */
-   if (register_is_null(*reg) &&
+   if (register_is_null(reg) &&
arg_type == ARG_PTR_TO_MEM_OR_NULL)
/* final test in check_stack_boundary() */;
else if (!type_is_pkt_pointer(type) &&
-- 
2.9.5



[PATCH net-next 0/7] bpf: verifier improvements and cleanups

2017-11-30 Thread Alexei Starovoitov
Small set of verifier improvements and cleanups which is
necessary for bigger patch set of bpf-to-bpf calls coming later.
See individual patches for details.
Tested on x86 and arm64 hw.

Alexei Starovoitov (7):
  bpf: fix stack state printing in verifier log
  bpf: print liveness info to verifier log
  bpf: don't mark FP reg as uninit
  bpf: improve verifier liveness marks
  bpf: improve JEQ/JNE path walking
  bpf: cleanup register_is_null()
  selftests/bpf: adjust test_align expected output

 kernel/bpf/verifier.c|  62 
 tools/testing/selftests/bpf/test_align.c | 156 +++
 2 files changed, 120 insertions(+), 98 deletions(-)

-- 
2.9.5



[PATCH net-next 4/7] bpf: improve verifier liveness marks

2017-11-30 Thread Alexei Starovoitov
registers with pointers filled from stack were missing live_written marks
which caused liveness propagation to unnecessary mark more registers as
live_read and miss state pruning opportunities later on.

 before  after
bpf_lb-DLB_L3.o   2285   2270
bpf_lb-DLB_L4.o   3723   3682
bpf_lb-DUNKNOWN.o 1110   1110
bpf_lxc-DDROP_ALL.o   27954  27876
bpf_lxc-DUNKNOWN.o38954  38780
bpf_netdev.o  16943  16937
bpf_overlay.o 7929   7929

Signed-off-by: Alexei Starovoitov 
Acked-by: John Fastabend 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/verifier.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ad7c6e806a..46ff4e5b3fb7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -795,6 +795,11 @@ static int check_stack_read(struct bpf_verifier_env *env,
if (value_regno >= 0) {
/* restore register state from stack */
state->regs[value_regno] = 
state->stack[spi].spilled_ptr;
+   /* mark reg as written since spilled pointer state 
likely
+* has its liveness marks cleared by is_state_visited()
+* which resets stack/reg liveness for state transitions
+*/
+   state->regs[value_regno].live |= REG_LIVE_WRITTEN;
mark_stack_slot_read(state, spi);
}
return 0;
-- 
2.9.5



[PATCH net-next 1/7] bpf: fix stack state printing in verifier log

2017-11-30 Thread Alexei Starovoitov
fix incorrect stack state prints in print_verifier_state()

Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption")
Signed-off-by: Alexei Starovoitov 
Acked-by: John Fastabend 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..71a9429fdbb5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -279,7 +279,7 @@ static void print_verifier_state(struct bpf_verifier_env 
*env,
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
if (state->stack[i].slot_type[0] == STACK_SPILL)
verbose(env, " fp%d=%s",
-   -MAX_BPF_STACK + i * BPF_REG_SIZE,
+   (-i - 1) * BPF_REG_SIZE,
reg_type_str[state->stack[i].spilled_ptr.type]);
}
verbose(env, "\n");
-- 
2.9.5



[PATCH net-next 5/7] bpf: improve JEQ/JNE path walking

2017-11-30 Thread Alexei Starovoitov
verifier knows how to trim paths that are known not to be
taken at run-time when register containing run-time constant
is compared with another constant.
It was done only for JEQ comparison.
Extend it to include JNE as well.
More cases can be added in the future.

 before  after
bpf_lb-DLB_L3.o   22702051
bpf_lb-DLB_L4.o   36823287
bpf_lb-DUNKNOWN.o 11101080
bpf_lxc-DDROP_ALL.o   27876   24980
bpf_lxc-DUNKNOWN.o38780   34308
bpf_netdev.o  16937   15404
bpf_overlay.o 79297191

Signed-off-by: Alexei Starovoitov 
Acked-by: John Fastabend 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/verifier.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 46ff4e5b3fb7..afe9a1a0a5fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2955,8 +2955,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
dst_reg->type == SCALAR_VALUE &&
-   tnum_equals_const(dst_reg->var_off, insn->imm)) {
-   if (opcode == BPF_JEQ) {
+   tnum_is_const(dst_reg->var_off)) {
+   if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) 
||
+   (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) 
{
/* if (imm == imm) goto pc+off;
 * only follow the goto, ignore fall-through
 */
-- 
2.9.5



[PATCH net-next 7/7] selftests/bpf: adjust test_align expected output

2017-11-30 Thread Alexei Starovoitov
since verifier started to print liveness state of the registers
adjust expected output of test_align.
Now this test checks for both proper alignment handling by verifier
and correctness of liveness marks.

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 tools/testing/selftests/bpf/test_align.c | 156 +++
 1 file changed, 78 insertions(+), 78 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_align.c 
b/tools/testing/selftests/bpf/test_align.c
index 8591c89c0828..fe916d29e166 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -64,11 +64,11 @@ static struct bpf_align_test tests[] = {
.matches = {
{1, "R1=ctx(id=0,off=0,imm=0)"},
{1, "R10=fp0"},
-   {1, "R3=inv2"},
-   {2, "R3=inv4"},
-   {3, "R3=inv8"},
-   {4, "R3=inv16"},
-   {5, "R3=inv32"},
+   {1, "R3_w=inv2"},
+   {2, "R3_w=inv4"},
+   {3, "R3_w=inv8"},
+   {4, "R3_w=inv16"},
+   {5, "R3_w=inv32"},
},
},
{
@@ -92,17 +92,17 @@ static struct bpf_align_test tests[] = {
.matches = {
{1, "R1=ctx(id=0,off=0,imm=0)"},
{1, "R10=fp0"},
-   {1, "R3=inv1"},
-   {2, "R3=inv2"},
-   {3, "R3=inv4"},
-   {4, "R3=inv8"},
-   {5, "R3=inv16"},
-   {6, "R3=inv1"},
-   {7, "R4=inv32"},
-   {8, "R4=inv16"},
-   {9, "R4=inv8"},
-   {10, "R4=inv4"},
-   {11, "R4=inv2"},
+   {1, "R3_w=inv1"},
+   {2, "R3_w=inv2"},
+   {3, "R3_w=inv4"},
+   {4, "R3_w=inv8"},
+   {5, "R3_w=inv16"},
+   {6, "R3_w=inv1"},
+   {7, "R4_w=inv32"},
+   {8, "R4_w=inv16"},
+   {9, "R4_w=inv8"},
+   {10, "R4_w=inv4"},
+   {11, "R4_w=inv2"},
},
},
{
@@ -121,12 +121,12 @@ static struct bpf_align_test tests[] = {
.matches = {
{1, "R1=ctx(id=0,off=0,imm=0)"},
{1, "R10=fp0"},
-   {1, "R3=inv4"},
-   {2, "R3=inv8"},
-   {3, "R3=inv10"},
-   {4, "R4=inv8"},
-   {5, "R4=inv12"},
-   {6, "R4=inv14"},
+   {1, "R3_w=inv4"},
+   {2, "R3_w=inv8"},
+   {3, "R3_w=inv10"},
+   {4, "R4_w=inv8"},
+   {5, "R4_w=inv12"},
+   {6, "R4_w=inv14"},
},
},
{
@@ -143,10 +143,10 @@ static struct bpf_align_test tests[] = {
.matches = {
{1, "R1=ctx(id=0,off=0,imm=0)"},
{1, "R10=fp0"},
-   {1, "R3=inv7"},
-   {2, "R3=inv7"},
-   {3, "R3=inv14"},
-   {4, "R3=inv56"},
+   {1, "R3_w=inv7"},
+   {2, "R3_w=inv7"},
+   {3, "R3_w=inv14"},
+   {4, "R3_w=inv56"},
},
},
 
@@ -185,18 +185,18 @@ static struct bpf_align_test tests[] = {
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.matches = {
{7, "R0=pkt(id=0,off=8,r=8,imm=0)"},
-   {7, "R3=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-   {8, "R3=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-   {9, "R3=inv(id=0,umax_value=1020,var_off=(0x0; 
0x3fc))"},
-   {10, "R3=inv(id=0,umax_value=2040,var_off=(0x0; 
0x7f8))"},
-   {11, "R3=inv(id=0,umax_value=4080,var_off=(0x0; 
0xff0))"},
+   {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 
0xff))"},
+   {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 
0x1fe))"},
+   {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 
0x3fc))"},
+   {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 
0x7f8))"},
+   {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 
0xff0))"},
{18, "R3=pkt_end(id=0,off=0,imm=0)"},
-   {18, "R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-   {19, "R4=inv(id=0,umax_value=8160,var_off=(0x0; 

[PATCH net-next 2/7] bpf: print liveness info to verifier log

2017-11-30 Thread Alexei Starovoitov
let verifier print register and stack liveness information
into verifier log

Signed-off-by: Alexei Starovoitov 
Acked-by: John Fastabend 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/verifier.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 71a9429fdbb5..f7229390c279 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -216,6 +216,17 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
 };
 
+static void print_liveness(struct bpf_verifier_env *env,
+  enum bpf_reg_liveness live)
+{
+   if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
+   verbose(env, "_");
+   if (live & REG_LIVE_READ)
+   verbose(env, "r");
+   if (live & REG_LIVE_WRITTEN)
+   verbose(env, "w");
+}
+
 static void print_verifier_state(struct bpf_verifier_env *env,
 struct bpf_verifier_state *state)
 {
@@ -228,7 +239,9 @@ static void print_verifier_state(struct bpf_verifier_env 
*env,
t = reg->type;
if (t == NOT_INIT)
continue;
-   verbose(env, " R%d=%s", i, reg_type_str[t]);
+   verbose(env, " R%d", i);
+   print_liveness(env, reg->live);
+   verbose(env, "=%s", reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
@@ -277,10 +290,13 @@ static void print_verifier_state(struct bpf_verifier_env 
*env,
}
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-   if (state->stack[i].slot_type[0] == STACK_SPILL)
-   verbose(env, " fp%d=%s",
-   (-i - 1) * BPF_REG_SIZE,
+   if (state->stack[i].slot_type[0] == STACK_SPILL) {
+   verbose(env, " fp%d",
+   (-i - 1) * BPF_REG_SIZE);
+   print_liveness(env, state->stack[i].spilled_ptr.live);
+   verbose(env, "=%s",
reg_type_str[state->stack[i].spilled_ptr.type]);
+   }
}
verbose(env, "\n");
 }
-- 
2.9.5



Re: [RFC] virtio-net: help live migrate SR-IOV devices

2017-11-30 Thread Michael S. Tsirkin
On Thu, Nov 30, 2017 at 12:48:22PM -0800, Jakub Kicinski wrote:
> On Thu, 30 Nov 2017 15:54:40 +0200, Michael S. Tsirkin wrote:
> > On Wed, Nov 29, 2017 at 07:51:38PM -0800, Jakub Kicinski wrote:
> > > On Thu, 30 Nov 2017 11:29:56 +0800, Jason Wang wrote:  
> > > > On 2017年11月29日 03:27, Jesse Brandeburg wrote:  
> > > > > Hi, I'd like to get some feedback on a proposal to enhance virtio-net
> > > > > to ease configuration of a VM and that would enable live migration of
> > > > > passthrough network SR-IOV devices.
> > > > >
> > > > > Today we have SR-IOV network devices (VFs) that can be passed into a 
> > > > > VM
> > > > > in order to enable high performance networking direct within the VM.
> > > > > The problem I am trying to address is that this configuration is
> > > > > generally difficult to live-migrate.  There is documentation [1]
> > > > > indicating that some OS/Hypervisor vendors will support live migration
> > > > > of a system with a direct assigned networking device.  The problem I
> > > > > see with these implementations is that the network configuration
> > > > > requirements that are passed on to the owner of the VM are quite
> > > > > complicated.  You have to set up bonding, you have to configure it to
> > > > > enslave two interfaces, those interfaces (one is virtio-net, the other
> > > > > is SR-IOV device/driver like ixgbevf) must support MAC address changes
> > > > > requested in the VM, and on and on...
> > > > >
> > > > > So, on to the proposal:
> > > > > Modify virtio-net driver to be a single VM network device that
> > > > > enslaves an SR-IOV network device (inside the VM) with the same MAC
> > > > > address. This would cause the virtio-net driver to appear and work 
> > > > > like
> > > > > a simplified bonding/team driver.  The live migration problem would be
> > > > > solved just like today's bonding solution, but the VM user's 
> > > > > networking
> > > > > config would be greatly simplified.
> > > > >
> > > > > At it's simplest, it would appear something like this in the VM.
> > > > >
> > > > > ==
> > > > > = vnet0  =
> > > > >   =
> > > > > (virtio- =   |
> > > > >   net)=   |
> > > > >   =  ==
> > > > >   =  = ixgbef =
> > > > > ==  ==
> > > > >
> > > > > (forgive the ASCII art)
> > > > >
> > > > > The fast path traffic would prefer the ixgbevf or other SR-IOV device
> > > > > path, and fall back to virtio's transmit/receive when migrating.
> > > > >
> > > > > Compared to today's options this proposal would
> > > > > 1) make virtio-net more sticky, allow fast path traffic at SR-IOV
> > > > > speeds
> > > > > 2) simplify end user configuration in the VM (most if not all of the
> > > > > set up to enable migration would be done in the hypervisor)
> > > > > 3) allow live migration via a simple link down and maybe a PCI
> > > > > hot-unplug of the SR-IOV device, with failover to the virtio-net
> > > > > driver core
> > > > > 4) allow vendor agnostic hardware acceleration, and live migration
> > > > > between vendors if the VM os has driver support for all the 
> > > > > required
> > > > > SR-IOV devices.
> > > > >
> > > > > Runtime operation proposed:
> > > > > -  virtio-net driver loads, SR-IOV driver loads
> > > > > - virtio-net finds other NICs that match it's MAC address by
> > > > >both examining existing interfaces, and sets up a new device 
> > > > > notifier
> > > > > - virtio-net enslaves the first NIC with the same MAC address
> > > > > - virtio-net brings up the slave, and makes it the "preferred" path
> > > > > - virtio-net follows the behavior of an active backup bond/team
> > > > > - virtio-net acts as the interface to the VM
> > > > > - live migration initiates
> > > > > - link goes down on SR-IOV, or SR-IOV device is removed
> > > > > - failover to virtio-net as primary path
> > > > > - migration continues to new host
> > > > > - new host is started with virio-net as primary
> > > > > - if no SR-IOV, virtio-net stays primary
> > > > > - hypervisor can hot-add SR-IOV NIC, with same MAC addr as virtio
> > > > > - virtio-net notices new NIC and starts over at enslave step above
> > > > >
> > > > > Future ideas (brainstorming):
> > > > > - Optimize Fast east-west by having special rules to direct east-west
> > > > >traffic through virtio-net traffic path
> > > > >
> > > > > Thanks for reading!
> > > > > Jesse
> > > > 
> > > > Cc netdev.
> > > > 
> > > > Interesting, and this method is actually used by netvsc now:
> > > > 
> > > > commit 0c195567a8f6e82ea5535cd9f1d54a1626dd233e
> > > > Author: stephen hemminger 
> > > > Date:   Tue Aug 1 19:58:53 2017 -0700
> > > > 
> > > >      netvsc: transparent VF management
> > > > 
> > > >      This patch implements transparent fail over from synthetic NIC to
> > > >      SR-IOV virtual function NIC in Hyper-V environment. It is a better
> > > >      alternative to using bonding as 

Re: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

2017-11-30 Thread Al Viro
On Fri, Dec 01, 2017 at 03:48:59AM +, Al Viro wrote:

> Something similar to get_prog_path_type() above might make for a usable
> primitive, IMO...

Incidentally, bpf_obj_get_user()/bpf_obj_do_get() should just use
user_path(), rather than wanking with 
getname()+kern_path(pname->name)+putname().
Note that kern_path() will do getname_kernel() to get struct pathname...

Would cause problems for tracepoints in there, though.  And that, BTW,
is precisely why I don't want tracepoints in core VFS, TYVM - makes
restructuring the code harder...


Re: [PATCH] netfilter: add overflow checks in xt_bpf.c

2017-11-30 Thread Willem de Bruijn
On Thu, Nov 30, 2017 at 11:08 PM, Jann Horn  wrote:
> On Fri, Dec 1, 2017 at 5:04 AM, Willem de Bruijn
>  wrote:
>> On Thu, Nov 30, 2017 at 7:46 PM, Jann Horn  wrote:
>>> Check whether inputs from userspace are too long (explicit length field too
>>> big or string not null-terminated) to avoid out-of-bounds reads.
>>>
>>> As far as I can tell, this can at worst lead to very limited kernel heap
>>> memory disclosure or oopses.
>>>
>>> This bug can be triggered by an unprivileged user even if the xt_bpf module
>>> is not loaded: iptables is available in network namespaces, and the xt_bpf
>>> module can be autoloaded.
>>>
>>> Triggering the bug with a classic BPF filter with fake length 0x1000 causes
>>> the following KASAN report:
>>>
>>> ==
>>> BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0
>>> Read of size 32768 at addr 8801eff2c494 by task test/4627
>>>
>>> CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1
>>> [...]
>>> Call Trace:
>>>  dump_stack+0x5c/0x85
>>>  print_address_description+0x6a/0x260
>>>  kasan_report+0x254/0x370
>>>  ? bpf_prog_create+0x84/0xf0
>>>  memcpy+0x1f/0x50
>>>  bpf_prog_create+0x84/0xf0
>>>  bpf_mt_check+0x90/0xd6 [xt_bpf]
>>> [...]
>>> Allocated by task 4627:
>>>  kasan_kmalloc+0xa0/0xd0
>>>  __kmalloc_node+0x47/0x60
>>>  xt_alloc_table_info+0x41/0x70 [x_tables]
>>> [...]
>>> The buggy address belongs to the object at 8801eff2c3c0
>>> which belongs to the cache kmalloc-2048 of size 2048
>>> The buggy address is located 212 bytes inside of
>>> 2048-byte region [8801eff2c3c0, 8801eff2cbc0)
>>> [...]
>>> ==
>>>
>>> Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match")
>>> Signed-off-by: Jann Horn 
>>> ---
>>>  net/netfilter/xt_bpf.c | 6 ++
>>>  1 file changed, 6 insertions(+)
>>>
>>> diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
>>> index 041da0d9c06f..1f7fbd3c7e5a 100644
>>> --- a/net/netfilter/xt_bpf.c
>>> +++ b/net/netfilter/xt_bpf.c
>>> @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter 
>>> *insns, __u16 len,
>>>  {
>>> struct sock_fprog_kern program;
>>>
>>> +   if (len > XT_BPF_MAX_NUM_INSTR)
>>> +   return -EINVAL;
>>> +
>>> program.len = len;
>>> program.filter = insns;
>>
>> Next, this calls bpf_prog_create, which calls bpf_check_basics_ok to verify 
>> len.
>
> Irrelevant:
>
>  - see the KASAN splat in the commit message
>  - bpf_check_basics_ok checks against BPF_MAXINSNS (4096), but a check against
>XT_BPF_MAX_NUM_INSTR (64) is needed because that's the size of the
> member in the
>input struct

Argh, of course. Thanks.


Re: [PATCH] netfilter: add overflow checks in xt_bpf.c

2017-11-30 Thread Jann Horn
On Fri, Dec 1, 2017 at 5:04 AM, Willem de Bruijn
 wrote:
> On Thu, Nov 30, 2017 at 7:46 PM, Jann Horn  wrote:
>> Check whether inputs from userspace are too long (explicit length field too
>> big or string not null-terminated) to avoid out-of-bounds reads.
>>
>> As far as I can tell, this can at worst lead to very limited kernel heap
>> memory disclosure or oopses.
>>
>> This bug can be triggered by an unprivileged user even if the xt_bpf module
>> is not loaded: iptables is available in network namespaces, and the xt_bpf
>> module can be autoloaded.
>>
>> Triggering the bug with a classic BPF filter with fake length 0x1000 causes
>> the following KASAN report:
>>
>> ==
>> BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0
>> Read of size 32768 at addr 8801eff2c494 by task test/4627
>>
>> CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1
>> [...]
>> Call Trace:
>>  dump_stack+0x5c/0x85
>>  print_address_description+0x6a/0x260
>>  kasan_report+0x254/0x370
>>  ? bpf_prog_create+0x84/0xf0
>>  memcpy+0x1f/0x50
>>  bpf_prog_create+0x84/0xf0
>>  bpf_mt_check+0x90/0xd6 [xt_bpf]
>> [...]
>> Allocated by task 4627:
>>  kasan_kmalloc+0xa0/0xd0
>>  __kmalloc_node+0x47/0x60
>>  xt_alloc_table_info+0x41/0x70 [x_tables]
>> [...]
>> The buggy address belongs to the object at 8801eff2c3c0
>> which belongs to the cache kmalloc-2048 of size 2048
>> The buggy address is located 212 bytes inside of
>> 2048-byte region [8801eff2c3c0, 8801eff2cbc0)
>> [...]
>> ==
>>
>> Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match")
>> Signed-off-by: Jann Horn 
>> ---
>>  net/netfilter/xt_bpf.c | 6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
>> index 041da0d9c06f..1f7fbd3c7e5a 100644
>> --- a/net/netfilter/xt_bpf.c
>> +++ b/net/netfilter/xt_bpf.c
>> @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter 
>> *insns, __u16 len,
>>  {
>> struct sock_fprog_kern program;
>>
>> +   if (len > XT_BPF_MAX_NUM_INSTR)
>> +   return -EINVAL;
>> +
>> program.len = len;
>> program.filter = insns;
>
> Next, this calls bpf_prog_create, which calls bpf_check_basics_ok to verify 
> len.

Irrelevant:

 - see the KASAN splat in the commit message
 - bpf_check_basics_ok checks against BPF_MAXINSNS (4096), but a check against
   XT_BPF_MAX_NUM_INSTR (64) is needed because that's the size of the
member in the
   input struct


Re: [PATCH] netfilter: add overflow checks in xt_bpf.c

2017-11-30 Thread Willem de Bruijn
On Thu, Nov 30, 2017 at 7:46 PM, Jann Horn  wrote:
> Check whether inputs from userspace are too long (explicit length field too
> big or string not null-terminated) to avoid out-of-bounds reads.
>
> As far as I can tell, this can at worst lead to very limited kernel heap
> memory disclosure or oopses.
>
> This bug can be triggered by an unprivileged user even if the xt_bpf module
> is not loaded: iptables is available in network namespaces, and the xt_bpf
> module can be autoloaded.
>
> Triggering the bug with a classic BPF filter with fake length 0x1000 causes
> the following KASAN report:
>
> ==
> BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0
> Read of size 32768 at addr 8801eff2c494 by task test/4627
>
> CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1
> [...]
> Call Trace:
>  dump_stack+0x5c/0x85
>  print_address_description+0x6a/0x260
>  kasan_report+0x254/0x370
>  ? bpf_prog_create+0x84/0xf0
>  memcpy+0x1f/0x50
>  bpf_prog_create+0x84/0xf0
>  bpf_mt_check+0x90/0xd6 [xt_bpf]
> [...]
> Allocated by task 4627:
>  kasan_kmalloc+0xa0/0xd0
>  __kmalloc_node+0x47/0x60
>  xt_alloc_table_info+0x41/0x70 [x_tables]
> [...]
> The buggy address belongs to the object at 8801eff2c3c0
> which belongs to the cache kmalloc-2048 of size 2048
> The buggy address is located 212 bytes inside of
> 2048-byte region [8801eff2c3c0, 8801eff2cbc0)
> [...]
> ==
>
> Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match")
> Signed-off-by: Jann Horn 
> ---
>  net/netfilter/xt_bpf.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
> index 041da0d9c06f..1f7fbd3c7e5a 100644
> --- a/net/netfilter/xt_bpf.c
> +++ b/net/netfilter/xt_bpf.c
> @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter 
> *insns, __u16 len,
>  {
> struct sock_fprog_kern program;
>
> +   if (len > XT_BPF_MAX_NUM_INSTR)
> +   return -EINVAL;
> +
> program.len = len;
> program.filter = insns;

Next, this calls bpf_prog_create, which calls bpf_check_basics_ok to verify len.

> @@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct 
> bpf_prog **ret)
> mm_segment_t oldfs = get_fs();
> int retval, fd;
>
> +   if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX)
> +   return -EINVAL;
> +

Good catch. It looks like this code needs a more thorough revision.

https://lkml.kernel.org/r/<20171201034859.gn21...@zeniv.linux.org.uk>


Re: [PATCH net-next 2/5] rhashtable: Add rhastable_walk_peek

2017-11-30 Thread Tom Herbert
On Thu, Nov 30, 2017 at 5:21 PM, Herbert Xu  wrote:
> On Thu, Nov 30, 2017 at 05:15:16PM -0800, Tom Herbert wrote:
>>
>> We don't need a guarantee of stability, but what I am seeing is that
>> we're consisitently dropping entries on when doing a multi-part
>> netlink walk. We start iterating over the table filling in the netlink
>> info. But eventually the netlink info fills up and returns an error.
>> netlink dump gets called again but now the iter of the table returns
>> the object following the one that would have overflowed the netlink
>> buffer. So the result I was seeing is that we dropped one object in in
>> each pass.
>
> Thanks Tom! This information is very useful.
>
> It sounds like this problem isn't specific to ila and would exist
> for all rhashtable users that dump through netlink.  Let me think
> about this a little bit more.
>
Right. Also note that the first patch is inspired by netlink dump
handling also. When we reach the end of the table (walk_next returns
NULL), we'll return a non-zero skb->len if some records have been
written to the buffer. On the next call to the dump we need to bounce
out immediately with zero length returned. Resetting the walker table
in walk start because it's NULL results in infinite loop if -EAGAIN is
ignored by the caller (rhashtable_walk_start returning void is nice
side effect of this).

Tom


> Cheers,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

2017-11-30 Thread Al Viro
On Fri, Dec 01, 2017 at 01:33:04AM +, Al Viro wrote:

> Use of file descriptors should be limited to "got a number from userland,
> convert to struct file *" on the way in and "install struct file * into
> descriptor table and return the descriptor to userland" on the way out.
> And the latter - *ONLY* after the last possible point of failure.  Once
> a file reference is inserted into descriptor table, that's it - you
> can't undo that.
> 
> The only way to use bpf_obj_get_user() is to pass its return value to
> userland.  As return value of syscall - not even put_user() (for that
> you'd need to reserve the descriptor, copy it to userland and only
> then attach struct file * to it).
> 
> The whole approach stinks - what it needs is something that would
> take struct filename * and return struct bpf_prog * or struct file *
> reference.  With bpf_obj_get_user() and this thing implemented
> via that.
> 
> I'm looking into that thing...

What it tries to pull off is something not far from

static struct bpf_prog *__get_prog(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
int err = inode_permission(inode, FMODE_READ | FMODE_WRITE);
if (err)
return ERR_PTR(err);

if (inode->i_op == &bpf_map_iops)
return ERR_PTR(-EINVAL);

if (inode->i_op != &bpf_prog_iops)
return ERR_PTR(-EACCES);

prog = inode->i_private;
err = security_bpf_prog(prog);
if (err < 0)
return ERR_PTR(err);

if (!bpf_prog_get_ok(prog, &type, false))
return ERR_PTR(-EINVAL);

return bpf_prog_inc(prog);
}

struct bpf_prog *get_prog_path_type(const char *name, enum bpf_prog_type type)
{
struct path path;
struct bpf_prog *prog;
int err = kern_path(name, LOOKUP_FOLLOW, &path);
if (err)
return ERR_PTR(err);
prog = __get_prog(d_backing_inode(path.dentry), type);
if (!IS_ERR(prog))
touch_atime(&path);
path_put(&path);
return prog;
}

static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
{
*ret = get_prog_path_type(path, BPF_PROG_TYPE_SOCKET_FILTER);
return PTR_ERR_OR_ZERO(*ret);
}

That skips all tracepoint random shite (pardon the triple redundance) and makes
a somewhat arbitrary change for touch_atime() logics.  And, of course, it is
not even compile-tested.

Something similar to get_prog_path_type() above might make for a usable
primitive, IMO...


[PATCH net-next] net: hns3: Refactors "reset" handling code in HCLGE layer of HNS3 driver

2017-11-30 Thread Salil Mehta
This patch refactors the code of the reset feature in HCLGE layer
of HNS3 PF driver. Prime motivation to do this change is:
1. To reduce the time for which common miscellaneous Vector 0
   interrupt is disabled because of the reset.
2. Simplification of reset request submission and pending reset
   logic.
3. Simplification of the common miscellaneous interrupt handler
   routine(for Vector 0) used to handle reset and other sources
   of Vector 0 interrupt.

To achieve above below few things have been done:
1. Interrupt is disabled while common miscellaneous interrupt
   handler is entered and re-enabled before it is exit. This
   reduces the interrupt handling latency as compared to older
   interrupt handling scheme where interrupt was being disabled
   in interrupt handler context and re-enabled in task context
   some time later.
2. Introduces new reset service task for honoring software reset
   requests like from network stack related to timeout and serving
   the pending reset request(to reset the driver and associated
   clients).
3. Made Miscellaneous interrupt handler more generic to handle
   all sources including reset interrupt source.

Signed-off-by: Salil Mehta 
Signed-off-by: lipeng 
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 229 ++---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h|  12 +-
 2 files changed, 167 insertions(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 59ed806a52c3..063be1c50a1d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -17,7 +17,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include "hclge_cmd.h"
 #include "hclge_dcb.h"
 #include "hclge_main.h"
@@ -2226,6 +2226,12 @@ static int hclge_mac_init(struct hclge_dev *hdev)
return hclge_cfg_func_mta_filter(hdev, 0, hdev->accept_mta_mc);
 }
 
+static void hclge_reset_task_schedule(struct hclge_dev *hdev)
+{
+   if (!test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state))
+   schedule_work(&hdev->rst_service_task);
+}
+
 static void hclge_task_schedule(struct hclge_dev *hdev)
 {
if (!test_bit(HCLGE_STATE_DOWN, &hdev->state) &&
@@ -2362,6 +2368,42 @@ static void hclge_service_complete(struct hclge_dev 
*hdev)
clear_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state);
 }
 
+static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
+{
+   u32 rst_src_reg;
+
+   /* fetch the events from their corresponding regs */
+   rst_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_RESET_STS_REG);
+
+   /* check for vector0 reset event sources */
+   if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & rst_src_reg) {
+   set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending);
+   *clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B);
+   return HCLGE_VECTOR0_EVENT_RST;
+   }
+
+   if (BIT(HCLGE_VECTOR0_CORERESET_INT_B) & rst_src_reg) {
+   set_bit(HNAE3_CORE_RESET, &hdev->reset_pending);
+   *clearval = BIT(HCLGE_VECTOR0_CORERESET_INT_B);
+   return HCLGE_VECTOR0_EVENT_RST;
+   }
+
+   if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & rst_src_reg) {
+   set_bit(HNAE3_IMP_RESET, &hdev->reset_pending);
+   *clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B);
+   return HCLGE_VECTOR0_EVENT_RST;
+   }
+
+   return HCLGE_VECTOR0_EVENT_OTHER;
+}
+
+static void hclge_clear_event_cause(struct hclge_dev *hdev, u32 event_type,
+   u32 regclr)
+{
+   if (event_type == HCLGE_VECTOR0_EVENT_RST)
+   hclge_write_dev(&hdev->hw, HCLGE_MISC_RESET_STS_REG, regclr);
+}
+
 static void hclge_enable_vector(struct hclge_misc_vector *vector, bool enable)
 {
writel(enable ? 1 : 0, vector->addr);
@@ -2370,10 +2412,28 @@ static void hclge_enable_vector(struct 
hclge_misc_vector *vector, bool enable)
 static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 {
struct hclge_dev *hdev = data;
+   u32 event_cause;
+   u32 clearval;
 
hclge_enable_vector(&hdev->misc_vector, false);
-   if (!test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state))
-   schedule_work(&hdev->service_task);
+   event_cause = hclge_check_event_cause(hdev, &clearval);
+
+   /* vector 0 interrupt is shared with reset and mailbox source events.
+* For now, we are not handling mailbox events.
+*/
+   switch (event_cause) {
+   case HCLGE_VECTOR0_EVENT_RST:
+   hclge_reset_task_schedule(hdev);
+   break;
+   default:
+   dev_dbg(&hdev->pdev->dev,
+   "received unknown or unhandled event of vector0\n");
+   break;
+   }
+
+   /* we should clear the source of interrupt */
+   hclge_clear_event_cause(hdev, event_cause, clearval);
+  

Re: [PATCH net 1/2] bpf: set maximum number of attached progs to 64 for a single perf tp

2017-11-30 Thread Yonghong Song



On 11/30/17 6:07 PM, Daniel Borkmann wrote:

On 11/30/2017 10:47 PM, Yonghong Song wrote:

cgroup+bpf prog array has a maximum number of 64 programs.
Let us apply the same limit here.

Signed-off-by: Yonghong Song 


Both applied to bpf tree, thanks! Please add a proper Fixes tags in the
future; took care of it this time.


Will try to remember next time :-).
Thanks for taking care of this!



Re: [PATCH net-next 3/5] bpftool: implement cgattach command

2017-11-30 Thread Jakub Kicinski
On Thu, 30 Nov 2017 13:43:00 +, Roman Gushchin wrote:
> + attach_type = parse_attach_type(argv[2]);
> + if (attach_type == __MAX_BPF_ATTACH_TYPE) {
> + bpf_object__close(obj);
> + close(prog_fd);
> + close(cgroup_fd);
> + p_err("Invalid attach type\n");
> + return -1;
> + }
> +
> + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, 0)) {
> + bpf_object__close(obj);
> + close(prog_fd);
> + close(cgroup_fd);
> + p_err("Failed to attach program");
> + return -1;
> + }
> +
> + bpf_object__close(obj);
> + close(prog_fd);
> + close(cgroup_fd);
> +
> + return 0;
> +}

Could you try to consolidate the error paths into a one larger handler
and use gotos to jump to it?  You can see it done in number of places,
grep for e.g. exit_free.


Re: [PATCH net-next 0/5] bpftool: cgroup bpf operations

2017-11-30 Thread Jakub Kicinski
Hi Roman!

On Thu, 30 Nov 2017 13:42:57 +, Roman Gushchin wrote:
> This patchset adds basic cgroup bpf operations to bpftool.
> 
> Right now there is no convenient way to perform these operations.
> The /samples/bpf/load_sock_ops.c implements attach/detach operations,
> but only for BPF_CGROUP_SOCK_OPS programs. Bps (part of bcc) implements
> bpf introspection, but lacks any cgroup-related specific.
> 
> I find having a tool to perform these basic operations in the kernel tree
> very useful, as it can be used in the corresponding bpf documentation
> without creating additional dependencies. And bpftool seems to be
> a right tool to extend with such functionality.

Could you place your code in a new file and add a new "object level"?
I.e. 
bpftool cgroup list 
bpftool cgroup attach ...
bpftool cgroup help
etc?  Note that you probably want the list to be first, so if someone
types "bpftool cg" it runs list by default.

Does it make sense to support pinned files and specifying programs by
id?  I used the "id"/"pinned" keywords so that users can choose to use
either.  Perhaps you should at least prefix the file to with "file"?
So:
$ bpftool cgattach file ./mybpfprog.o /sys/fs/cgroup/user.slice/ ingress
$ bpftool cgattach id 19 /sys/fs/cgroup/user.slice/ ingress
$ bpftool cgattach pin /bpf/prog /sys/fs/cgroup/user.slice/ ingress
Would this make sense?

Smaller nits on the coding style:
 - please try to run checkpatch, perhaps you did, but some people
   forget tools are in the kernel tree :)
 - please keep includes in alphabetical order;
 - please keep variable declarations in functions ordered longest to
   shortest, if that's impossible because of dependency between
   initializers - move the initializers to the code.

Please also don't forget to update/create new man page.

Thanks! :)


Re: [PATCH net 1/2] bpf: set maximum number of attached progs to 64 for a single perf tp

2017-11-30 Thread Daniel Borkmann
On 11/30/2017 10:47 PM, Yonghong Song wrote:
> cgroup+bpf prog array has a maximum number of 64 programs.
> Let us apply the same limit here.
> 
> Signed-off-by: Yonghong Song 

Both applied to bpf tree, thanks! Please add a proper Fixes tags in the
future; took care of it this time.


Re: [RFC] bpf: offload: report device information for offloaded programs

2017-11-30 Thread Jakub Kicinski
Hi Kirill,

On Thu, 30 Nov 2017 16:19:13 +0300, Kirill Tkhai wrote:
> > @@ -164,6 +166,38 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
> > return bpf_prog_offload_translate(prog);
> >  }
> >  
> > +int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
> > +  struct bpf_prog *prog)
> > +{
> > +   struct bpf_dev_offload *offload = prog->aux->offload;
> > +   struct inode *ns_inode;
> > +   struct path ns_path;
> > +   struct net *net;
> > +   int ret = 0;
> > +   void *ptr;
> > +
> > +   info->dev_bound = 1;
> > +
> > +   rtnl_lock();  
> 
> rtnl_lock() is too big lock and it is already overused in kernel.
> Can't we use smaller lock in this driver to protect bpf_prog_offload_devs?
> I suppose rwlock would be appropriate for that.
> 
> (Then, we may completely remove rtnl_lock() from bpf_prog_offload_init()
> and use readlocked dev_base_lock for __dev_get_by_index() instead and
> the new small_rwlock to link in the list.
> 
> Not sure about bpf_prog_offload_verifier_prep() and 
> bpf_prog_offload_translate()
> and which context expect net_device_ops->ndo_bpf users. Either they need rtnl
> or not).

Thanks for the comments, removing the use of rtnl_lock is definitely on
my todo list!


[PATCH net] ipvlan: Add the skb->mark as flow4's member to lookup route

2017-11-30 Thread gfree . wind
From: Gao Feng 

Current codes don't use skb->mark to assign flowi4_mark, it would
make the policy route rule with fwmark doesn't work as expected.

Signed-off-by: Gao Feng 
---
 drivers/net/ipvlan/ipvlan_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 11c1e79..77cc4fb 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -393,6 +393,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
.flowi4_oif = dev->ifindex,
.flowi4_tos = RT_TOS(ip4h->tos),
.flowi4_flags = FLOWI_FLAG_ANYSRC,
+   .flowi4_mark = skb->mark,
.daddr = ip4h->daddr,
.saddr = ip4h->saddr,
};
-- 
1.9.1




Re: [BUG] kernel stack corruption during/after Netlabel error

2017-11-30 Thread James Morris
On Thu, 30 Nov 2017, Eric Dumazet wrote:

> On Wed, 2017-11-29 at 19:16 -0800, Casey Schaufler wrote:
> > On 11/29/2017 4:31 PM, James Morris wrote:
> > > On Wed, 29 Nov 2017, Casey Schaufler wrote:
> > > 
> > > > I see that there is a proposed fix later in the thread, but I
> > > > don't see
> > > > the patch. Could you send it to me, so I can try it on my
> > > > problem?
> > > 
> > > Forwarded off-list.
> > 
> > The patch does fix the problem I was seeing in Smack.
> 
> Can you guys test the following more complete patch ?
> 
> It should cover IPv4 and IPv6, and also the corner cases.


Tested-by: James Morris 



-- 
James Morris




Re: [PATCH v4 3/8] MIPS: Octeon: Add a global resource manager.

2017-11-30 Thread David Daney

On 11/30/2017 02:53 PM, James Hogan wrote:

On Tue, Nov 28, 2017 at 04:55:35PM -0800, David Daney wrote:

From: Carlos Munoz 

Add a global resource manager to manage tagged pointers within
bootmem allocated memory. This is used by various functional
blocks in the Octeon core like the FPA, Ethernet nexus, etc.

Signed-off-by: Carlos Munoz 
Signed-off-by: Steven J. Hill 
Signed-off-by: David Daney 
---
  arch/mips/cavium-octeon/Makefile   |   3 +-
  arch/mips/cavium-octeon/resource-mgr.c | 371 +
  arch/mips/include/asm/octeon/octeon.h  |  18 ++
  3 files changed, 391 insertions(+), 1 deletion(-)
  create mode 100644 arch/mips/cavium-octeon/resource-mgr.c

diff --git a/arch/mips/cavium-octeon/Makefile b/arch/mips/cavium-octeon/Makefile
index 7c02e542959a..0a299ab8719f 100644
--- a/arch/mips/cavium-octeon/Makefile
+++ b/arch/mips/cavium-octeon/Makefile
@@ -9,7 +9,8 @@
  # Copyright (C) 2005-2009 Cavium Networks
  #
  
-obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o

+obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o \
+resource-mgr.o


Maybe put that on a separate line like below.


OK




  obj-y += dma-octeon.o
  obj-y += octeon-memcpy.o
  obj-y += executive/
diff --git a/arch/mips/cavium-octeon/resource-mgr.c 
b/arch/mips/cavium-octeon/resource-mgr.c
new file mode 100644
index ..ca25fa953402
--- /dev/null
+++ b/arch/mips/cavium-octeon/resource-mgr.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource manager for Octeon.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2017 Cavium, Inc.
+ */
+#include 
+
+#include 
+#include 
+
+#define RESOURCE_MGR_BLOCK_NAME"cvmx-global-resources"
+#define MAX_RESOURCES  128
+#define INST_AVAILABLE -88
+#define OWNER  0xbadc0de
+
+struct global_resource_entry {
+   struct global_resource_tag tag;
+   u64 phys_addr;
+   u64 size;
+};
+
+struct global_resources {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+   u32 rlock;
+   u32 pad;
+#else
+   u32 pad;
+   u32 rlock;
+#endif
+   u64 entry_cnt;
+   struct global_resource_entry resource_entry[];
+};
+
+static struct global_resources *res_mgr_info;
+
+
+/*
+ * The resource manager interacts with software running outside of the
+ * Linux kernel, which necessitates locking to maintain data structure
+ * consistency.  These custom locking functions implement the locking
+ * protocol, and cannot be replaced by kernel locking functions that
+ * may use different in-memory structures.
+ */
+
+static void res_mgr_lock(void)
+{
+   unsigned int tmp;
+   u64 lock = (u64)&res_mgr_info->rlock;


presumably this could be a u32 *, avoid the cast to u64, and still work
just fine below.


I will rewrite to just use cmpxchg()





+
+   __asm__ __volatile__(
+   ".set noreorder\n"
+   "1: ll   %[tmp], 0(%[addr])\n"
+   "   bnez %[tmp], 1b\n"
+   "   li   %[tmp], 1\n"


I believe the convention for .S files is for instructions in branch
delay slots to be indented an additional space for readability. Maybe
that would be worthwhile here.


+   "   sc   %[tmp], 0(%[addr])\n"
+   "   beqz %[tmp], 1b\n"
+   "   nop\n"


and here also.


+   ".set reorder\n" :


nit: strictly speaking there's no need for \n on the last line.


+   [tmp] "=&r"(tmp) :
+   [addr] "r"(lock) :
+   "memory");


minor style thing: its far more common to have : at the beginning of the
line rather than the end.


+}
+
+static void res_mgr_unlock(void)
+{
+   u64 lock = (u64)&res_mgr_info->rlock;


same again



Will rewrite to use WRITE_ONCE().


+
+   /* Wait until all resource operations finish before unlocking. */
+   mb();
+   __asm__ __volatile__(
+   "sw $0, 0(%[addr])\n" : :
+   [addr] "r"(lock) :
+   "memory");
+
+   /* Force a write buffer flush. */
+   mb();
+}
+
+static int res_mgr_find_resource(struct global_resource_tag tag)
+{
+   struct global_resource_entry *res_entry;
+   int i;
+
+   for (i = 0; i < res_mgr_info->entry_cnt; i++) {
+   res_entry = &res_mgr_info->resource_entry[i];
+   if (res_entry->tag.lo == tag.lo && res_entry->tag.hi == tag.hi)
+   return i;
+   }
+   return -1;
+}
+
+/**
+ * res_mgr_create_resource - Create a resource.
+ * @tag: Identifies the resource.
+ * @inst_cnt: Number of resource instances to create.
+ *
+ * Returns 0 if the resource was created successfully.
+ * Returns <0 for error codes.


Only -1 seems to be returned. Is it worth 

[PATCH net-next v2 6/8] selftests/bpf: add offload test based on netdevsim

2017-11-30 Thread Jakub Kicinski
Add a test of BPF offload control path interfaces based on
just-added netdevsim driver.  Perform various checks of both
the stack and the expected driver behaviour.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
Reviewed-by: Quentin Monnet 
---
 tools/testing/selftests/bpf/Makefile|   5 +-
 tools/testing/selftests/bpf/sample_ret0.c   |   7 +
 tools/testing/selftests/bpf/test_offload.py | 681 
 3 files changed, 691 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/sample_ret0.c
 create mode 100755 tools/testing/selftests/bpf/test_offload.py

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 333a48655ee0..2c9d8c63c6fa 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -17,9 +17,10 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps 
test_lru_map test_lpm_map test
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o 
test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o 
sockmap_parse_prog.o \
-   sockmap_verdict_prog.o dev_cgroup.o
+   sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o
 
-TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh
+TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
+   test_offload.py
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/bpf/sample_ret0.c 
b/tools/testing/selftests/bpf/sample_ret0.c
new file mode 100644
index ..fec99750d6ea
--- /dev/null
+++ b/tools/testing/selftests/bpf/sample_ret0.c
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+
+/* Sample program which should always load for testing control paths. */
+int func()
+{
+   return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_offload.py 
b/tools/testing/selftests/bpf/test_offload.py
new file mode 100755
index ..3914f7a4585a
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -0,0 +1,681 @@
+#!/usr/bin/python3
+
+# Copyright (C) 2017 Netronome Systems, Inc.
+#
+# This software is licensed under the GNU General License Version 2,
+# June 1991 as shown in the file COPYING in the top-level directory of this
+# source tree.
+#
+# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+from datetime import datetime
+import argparse
+import json
+import os
+import pprint
+import subprocess
+import time
+
+logfile = None
+log_level = 1
+bpf_test_dir = os.path.dirname(os.path.realpath(__file__))
+pp = pprint.PrettyPrinter()
+devs = [] # devices we created for clean up
+files = [] # files to be removed
+
+def log_get_sec(level=0):
+return "*" * (log_level + level)
+
+def log_level_inc(add=1):
+global log_level
+log_level += add
+
+def log_level_dec(sub=1):
+global log_level
+log_level -= sub
+
+def log_level_set(level):
+global log_level
+log_level = level
+
+def log(header, data, level=None):
+"""
+Output to an optional log.
+"""
+if logfile is None:
+return
+if level is not None:
+log_level_set(level)
+
+if not isinstance(data, str):
+data = pp.pformat(data)
+
+if len(header):
+logfile.write("\n" + log_get_sec() + " ")
+logfile.write(header)
+if len(header) and len(data.strip()):
+logfile.write("\n")
+logfile.write(data)
+
+def skip(cond, msg):
+if not cond:
+return
+print("SKIP: " + msg)
+log("SKIP: " + msg, "", level=1)
+os.sys.exit(0)
+
+def fail(cond, msg):
+if not cond:
+return
+print("FAIL: " + msg)
+log("FAIL: " + msg, "", level=1)
+os.sys.exit(1)
+
+def start_test(msg):
+log(msg, "", level=1)
+log_level_inc()
+print(msg)
+
+def cmd(cmd, shell=True, include_stderr=False, background=False, fail=True):
+"""
+Run a command in subprocess and return tuple of (retval, stdout);
+optionally return stderr as well as third value.
+"""
+proc = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE,
+stderr=subprocess.PIPE)
+if background:
+msg = "%s START: %s" % (log_get_sec(1),
+datetime.now().strftime("%H:%M:%S.%f"))
+log("BKG " + proc.args, msg)
+return proc
+
+return cmd_result(proc, include_stderr=include_stderr, fail=fail)
+
+def cmd_result(proc, include_stderr=False, fail=False):
+stdout, stderr = proc.communicate()
+stdout 

[PATCH net-next v2 2/8] net: xdp: report flags program was installed with on query

2017-11-30 Thread Jakub Kicinski
Some drivers enforce that flags on program replacement and
removal must match the flags passed on install.  This leaves
the possibility open to enable simultaneous loading
of XDP programs both to HW and DRV.

Allow such drivers to report the flags back to the stack.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
Reviewed-by: Quentin Monnet 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 1 +
 include/linux/netdevice.h   | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1a603fdd9e80..ea6bbf1efefc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3392,6 +3392,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct 
netdev_bpf *xdp)
if (nn->dp.bpf_offload_xdp)
xdp->prog_attached = XDP_ATTACHED_HW;
xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0;
+   xdp->flags = nn->xdp_prog ? nn->xdp_flags : 0;
return 0;
case BPF_OFFLOAD_VERIFIER_PREP:
return nfp_app_bpf_verifier_prep(nn->app, nn, xdp);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 667bdd3ad33e..cc4ce7456e38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -820,6 +820,8 @@ struct netdev_bpf {
struct {
u8 prog_attached;
u32 prog_id;
+   /* flags with which program was installed */
+   u32 prog_flags;
};
/* BPF_OFFLOAD_VERIFIER_PREP */
struct {
-- 
2.14.1



[PATCH net-next v2 0/8] xdp: make stack perform remove and tests

2017-11-30 Thread Jakub Kicinski
Hi!

The purpose of this series is to add a software model of BPF offloads
to make it easier for everyone to test them and make some of the more
arcane rules and assumptions more clear.

The series starts with 3 patches aiming to make XDP handling in the
drivers less error prone.  Currently driver authors have to remember
to free XDP programs if XDP is active during unregister.  With this
series the core will disable XDP on its own.  It will take place
after close, drivers are not expected to perform reconfiguration
when disabling XDP on a downed device.

Next two patches add the software netdev driver, followed by a python 
test which exercises all the corner cases which came to my mind.

Test needs to be run as root.  It will print basic information to
stdout, but can also create a more detailed log of all commands
when --log option is passed.  Log is in Emacs Org-mode format.

  ./tools/testing/selftests/bpf/test_offload.py --log /tmp/log

Last two patches replace the SR-IOV API implementation of dummy.

v2:
 - free device from the release function;
 - use bus-based name generatin instead of netdev name.
v1:
 - replace the SR-IOV API implementation of dummy;
 - make the dev_xdp_uninstall() also handle the XDP generic (Daniel).


Jakub Kicinski (8):
  net: xdp: avoid output parameters when querying XDP prog
  net: xdp: report flags program was installed with on query
  net: xdp: make the stack take care of the tear down
  netdevsim: add software driver for testing offloads
  netdevsim: add bpf offload support
  selftests/bpf: add offload test based on netdevsim
  netdevsim: add SR-IOV functionality
  net: dummy: remove fake SR-IOV functionality

 MAINTAINERS|   5 +
 drivers/net/Kconfig|  11 +
 drivers/net/Makefile   |   1 +
 drivers/net/dummy.c| 215 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c  |   2 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 -
 drivers/net/ethernet/netronome/nfp/bpf/main.c  |   7 -
 .../net/ethernet/netronome/nfp/nfp_net_common.c|   4 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c   |   4 -
 drivers/net/netdevsim/Makefile |   7 +
 drivers/net/netdevsim/bpf.c| 373 +++
 drivers/net/netdevsim/netdev.c | 501 +++
 drivers/net/netdevsim/netdevsim.h  |  78 +++
 drivers/net/tun.c  |   4 -
 include/linux/netdevice.h  |   5 +-
 net/core/dev.c |  53 +-
 net/core/rtnetlink.c   |   6 +-
 tools/testing/selftests/bpf/Makefile   |   5 +-
 tools/testing/selftests/bpf/sample_ret0.c  |   7 +
 tools/testing/selftests/bpf/test_offload.py| 681 +
 20 files changed, 1714 insertions(+), 258 deletions(-)
 create mode 100644 drivers/net/netdevsim/Makefile
 create mode 100644 drivers/net/netdevsim/bpf.c
 create mode 100644 drivers/net/netdevsim/netdev.c
 create mode 100644 drivers/net/netdevsim/netdevsim.h
 create mode 100644 tools/testing/selftests/bpf/sample_ret0.c
 create mode 100755 tools/testing/selftests/bpf/test_offload.py

-- 
2.14.1



[PATCH net-next v2 8/8] net: dummy: remove fake SR-IOV functionality

2017-11-30 Thread Jakub Kicinski
netdevsim driver seems like a better place for fake SR-IOV
functionality.  Remove the code previously added to dummy.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
CC: Phil Sutter 
CC: Sabrina Dubroca  
---
 drivers/net/dummy.c | 215 +---
 1 file changed, 1 insertion(+), 214 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 58483af80bdb..30b1c8512049 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -42,48 +42,7 @@
 #define DRV_NAME   "dummy"
 #define DRV_VERSION"1.0"
 
-#undef pr_fmt
-#define pr_fmt(fmt) DRV_NAME ": " fmt
-
 static int numdummies = 1;
-static int num_vfs;
-
-struct vf_data_storage {
-   u8  vf_mac[ETH_ALEN];
-   u16 pf_vlan; /* When set, guest VLAN config not allowed. */
-   u16 pf_qos;
-   __be16  vlan_proto;
-   u16 min_tx_rate;
-   u16 max_tx_rate;
-   u8  spoofchk_enabled;
-   boolrss_query_enabled;
-   u8  trusted;
-   int link_state;
-};
-
-struct dummy_priv {
-   struct vf_data_storage  *vfinfo;
-};
-
-static int dummy_num_vf(struct device *dev)
-{
-   return num_vfs;
-}
-
-static struct bus_type dummy_bus = {
-   .name   = "dummy",
-   .num_vf = dummy_num_vf,
-};
-
-static void release_dummy_parent(struct device *dev)
-{
-}
-
-static struct device dummy_parent = {
-   .init_name  = "dummy",
-   .bus= &dummy_bus,
-   .release= release_dummy_parent,
-};
 
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
@@ -133,25 +92,10 @@ static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct 
net_device *dev)
 
 static int dummy_dev_init(struct net_device *dev)
 {
-   struct dummy_priv *priv = netdev_priv(dev);
-
dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
if (!dev->dstats)
return -ENOMEM;
 
-   priv->vfinfo = NULL;
-
-   if (!num_vfs)
-   return 0;
-
-   dev->dev.parent = &dummy_parent;
-   priv->vfinfo = kcalloc(num_vfs, sizeof(struct vf_data_storage),
-  GFP_KERNEL);
-   if (!priv->vfinfo) {
-   free_percpu(dev->dstats);
-   return -ENOMEM;
-   }
-
return 0;
 }
 
@@ -169,117 +113,6 @@ static int dummy_change_carrier(struct net_device *dev, 
bool new_carrier)
return 0;
 }
 
-static int dummy_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if (!is_valid_ether_addr(mac) || (vf >= num_vfs))
-   return -EINVAL;
-
-   memcpy(priv->vfinfo[vf].vf_mac, mac, ETH_ALEN);
-
-   return 0;
-}
-
-static int dummy_set_vf_vlan(struct net_device *dev, int vf,
-u16 vlan, u8 qos, __be16 vlan_proto)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if ((vf >= num_vfs) || (vlan > 4095) || (qos > 7))
-   return -EINVAL;
-
-   priv->vfinfo[vf].pf_vlan = vlan;
-   priv->vfinfo[vf].pf_qos = qos;
-   priv->vfinfo[vf].vlan_proto = vlan_proto;
-
-   return 0;
-}
-
-static int dummy_set_vf_rate(struct net_device *dev, int vf, int min, int max)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if (vf >= num_vfs)
-   return -EINVAL;
-
-   priv->vfinfo[vf].min_tx_rate = min;
-   priv->vfinfo[vf].max_tx_rate = max;
-
-   return 0;
-}
-
-static int dummy_set_vf_spoofchk(struct net_device *dev, int vf, bool val)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if (vf >= num_vfs)
-   return -EINVAL;
-
-   priv->vfinfo[vf].spoofchk_enabled = val;
-
-   return 0;
-}
-
-static int dummy_set_vf_rss_query_en(struct net_device *dev, int vf, bool val)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if (vf >= num_vfs)
-   return -EINVAL;
-
-   priv->vfinfo[vf].rss_query_enabled = val;
-
-   return 0;
-}
-
-static int dummy_set_vf_trust(struct net_device *dev, int vf, bool val)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if (vf >= num_vfs)
-   return -EINVAL;
-
-   priv->vfinfo[vf].trusted = val;
-
-   return 0;
-}
-
-static int dummy_get_vf_config(struct net_device *dev,
-  int vf, struct ifla_vf_info *ivi)
-{
-   struct dummy_priv *priv = netdev_priv(dev);
-
-   if (vf >= num_vfs)
-   return -EINVAL;
-
-   ivi->vf = vf;
-   memcpy(&ivi->mac, priv->vfinfo[vf].vf_mac, ETH_ALEN);
-   ivi->vlan = priv->vfinfo[vf].pf_vlan;
-   ivi->qos = priv->vfinfo[vf].pf_qos;
-   ivi->spoofchk = priv->vfinfo[vf].spoofchk_enabled;
-   ivi->linkstate = priv->vfinfo[vf].link_state;
-   ivi->min_tx_rate = priv->vfinfo[vf].min_tx_rate;
-   ivi->max_tx_rate = 

[PATCH net-next v2 7/8] netdevsim: add SR-IOV functionality

2017-11-30 Thread Jakub Kicinski
dummy driver was extended with VF-related netdev APIs for testing
SR-IOV-related software.  netdevsim did not exist back then.
Implement SR-IOV functionality in netdevsim.  Notable difference
is that since netdevsim has no module parameters, we will actually
create a device with sriov_numvfs attribute for each netdev.
The zero MAC address is accepted as some HW use it to mean any
address is allowed.  Link state is also now validated.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
CC: Phil Sutter 
CC: Sabrina Dubroca  
---
 drivers/net/netdevsim/netdev.c| 273 +-
 drivers/net/netdevsim/netdevsim.h |  12 ++
 2 files changed, 283 insertions(+), 2 deletions(-)

diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 828c1ce49a8b..a7a2a2290957 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -25,6 +25,124 @@
 
 #include "netdevsim.h"
 
+struct nsim_vf_config {
+   int link_state;
+   u16 min_tx_rate;
+   u16 max_tx_rate;
+   u16 vlan;
+   __be16 vlan_proto;
+   u16 qos;
+   u8 vf_mac[ETH_ALEN];
+   bool spoofchk_enabled;
+   bool trusted;
+   bool rss_query_enabled;
+};
+
+static u32 nsim_dev_id;
+
+static int nsim_num_vf(struct device *dev)
+{
+   struct netdevsim *ns = to_nsim(dev);
+
+   return ns->num_vfs;
+}
+
+static struct bus_type nsim_bus = {
+   .name   = DRV_NAME,
+   .dev_name   = DRV_NAME,
+   .num_vf = nsim_num_vf,
+};
+
+static int nsim_vfs_enable(struct netdevsim *ns, unsigned int num_vfs)
+{
+   ns->vfconfigs = kcalloc(num_vfs, sizeof(struct nsim_vf_config),
+   GFP_KERNEL);
+   if (!ns->vfconfigs)
+   return -ENOMEM;
+   ns->num_vfs = num_vfs;
+
+   return 0;
+}
+
+static void nsim_vfs_disable(struct netdevsim *ns)
+{
+   kfree(ns->vfconfigs);
+   ns->vfconfigs = NULL;
+   ns->num_vfs = 0;
+}
+
+static ssize_t
+nsim_numvfs_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+   struct netdevsim *ns = to_nsim(dev);
+   unsigned int num_vfs;
+   int ret;
+
+   ret = kstrtouint(buf, 0, &num_vfs);
+   if (ret)
+   return ret;
+
+   rtnl_lock();
+   if (ns->num_vfs == num_vfs)
+   goto exit_good;
+   if (ns->num_vfs && num_vfs) {
+   ret = -EBUSY;
+   goto exit_unlock;
+   }
+
+   if (num_vfs) {
+   ret = nsim_vfs_enable(ns, num_vfs);
+   if (ret)
+   goto exit_unlock;
+   } else {
+   nsim_vfs_disable(ns);
+   }
+exit_good:
+   ret = count;
+exit_unlock:
+   rtnl_unlock();
+
+   return ret;
+}
+
+static ssize_t
+nsim_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+   struct netdevsim *ns = to_nsim(dev);
+
+   return sprintf(buf, "%u\n", ns->num_vfs);
+}
+
+static struct device_attribute nsim_numvfs_attr =
+   __ATTR(sriov_numvfs, 0664, nsim_numvfs_show, nsim_numvfs_store);
+
+static struct attribute *nsim_dev_attrs[] = {
+   &nsim_numvfs_attr.attr,
+   NULL,
+};
+
+static const struct attribute_group nsim_dev_attr_group = {
+   .attrs = nsim_dev_attrs,
+};
+
+static const struct attribute_group *nsim_dev_attr_groups[] = {
+   &nsim_dev_attr_group,
+   NULL,
+};
+
+static void nsim_dev_release(struct device *dev)
+{
+   struct netdevsim *ns = to_nsim(dev);
+
+   free_netdev(ns->netdev);
+}
+
+struct device_type nsim_dev_type = {
+   .groups = nsim_dev_attr_groups,
+   .release = nsim_dev_release,
+};
+
 static int nsim_init(struct net_device *dev)
 {
struct netdevsim *ns = netdev_priv(dev);
@@ -37,8 +155,19 @@ static int nsim_init(struct net_device *dev)
if (err)
goto err_debugfs_destroy;
 
+   ns->dev.id = nsim_dev_id++;
+   ns->dev.bus = &nsim_bus;
+   ns->dev.type = _dev_type;
+   err = device_register(>dev);
+   if (err)
+   goto err_bpf_uninit;
+
+   SET_NETDEV_DEV(dev, >dev);
+
return 0;
 
+err_bpf_uninit:
+   nsim_bpf_uninit(ns);
 err_debugfs_destroy:
debugfs_remove_recursive(ns->ddir);
return err;
@@ -50,6 +179,14 @@ static void nsim_uninit(struct net_device *dev)
 
debugfs_remove_recursive(ns->ddir);
nsim_bpf_uninit(ns);
+   nsim_vfs_disable(ns);
+}
+
+static void nsim_free(struct net_device *dev)
+{
+   struct netdevsim *ns = netdev_priv(dev);
+
+   device_unregister(>dev);
 }
 
 static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -122,6 +259,123 @@ nsim_setup_tc_block(struct net_device *dev, struct 
tc_block_offload *f)
}
 }
 
+static int nsim_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
+{

[PATCH net-next v2 1/8] net: xdp: avoid output parameters when querying XDP prog

2017-11-30 Thread Jakub Kicinski
The output parameters will get unwieldy if we want to add more
information about the program.  Simply pass the entire
struct netdev_bpf in.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
Reviewed-by: Quentin Monnet 
---
 include/linux/netdevice.h |  3 ++-
 net/core/dev.c| 24 ++--
 net/core/rtnetlink.c  |  6 +-
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef789e1d679e..667bdd3ad33e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3330,7 +3330,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, 
struct net_device *dev,
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
  int fd, u32 flags);
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id);
+void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
+struct netdev_bpf *xdp);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 07ed21d64f92..3f271c9cb5e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7073,17 +7073,21 @@ int dev_change_proto_down(struct net_device *dev, bool 
proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id)
+void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+struct netdev_bpf *xdp)
 {
-   struct netdev_bpf xdp;
-
-   memset(&xdp, 0, sizeof(xdp));
-   xdp.command = XDP_QUERY_PROG;
+   memset(xdp, 0, sizeof(*xdp));
+   xdp->command = XDP_QUERY_PROG;
 
/* Query must always succeed. */
-   WARN_ON(bpf_op(dev, &xdp) < 0);
-   if (prog_id)
-   *prog_id = xdp.prog_id;
+   WARN_ON(bpf_op(dev, xdp) < 0);
+}
+
+static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+{
+   struct netdev_bpf xdp;
+
+   __dev_xdp_query(dev, bpf_op, &xdp);
 
return xdp.prog_attached;
 }
@@ -7134,10 +7138,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct 
netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
 
if (fd >= 0) {
-   if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL))
+   if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
return -EEXIST;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-   __dev_xdp_attached(dev, bpf_op, NULL))
+   __dev_xdp_attached(dev, bpf_op))
return -EBUSY;
 
prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..9c4cb584bfb0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1261,6 +1261,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, 
u32 *prog_id)
 {
const struct net_device_ops *ops = dev->netdev_ops;
const struct bpf_prog *generic_xdp_prog;
+   struct netdev_bpf xdp;
 
ASSERT_RTNL();
 
@@ -1273,7 +1274,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, 
u32 *prog_id)
if (!ops->ndo_bpf)
return XDP_ATTACHED_NONE;
 
-   return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id);
+   __dev_xdp_query(dev, ops->ndo_bpf, &xdp);
+   *prog_id = xdp.prog_id;
+
+   return xdp.prog_attached;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
-- 
2.14.1



[PATCH net-next v2 3/8] net: xdp: make the stack take care of the tear down

2017-11-30 Thread Jakub Kicinski
Since day one of XDP drivers had to remember to free the program
on the remove path.  This leads to code duplication and is error
prone.  Make the stack query the installed programs on unregister
and if something is installed, remove the program.  Freeing of
program attached to XDP generic is moved from free_netdev() as well.

Because the remove will now be called before notifiers are
invoked, BPF offload state of the program will not get destroyed
before uninstall.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
Reviewed-by: Quentin Monnet 
---
CC: Saeed Mahameed 
CC: Michael Chan 
CC: Ariel Elior 
CC: John Fastabend 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c  |  2 --
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  3 ---
 drivers/net/ethernet/netronome/nfp/bpf/main.c  |  7 --
 .../net/ethernet/netronome/nfp/nfp_net_common.c|  3 ---
 drivers/net/ethernet/qlogic/qede/qede_main.c   |  4 ---
 drivers/net/tun.c  |  4 ---
 net/core/dev.c | 29 --
 7 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 33c49ad697e4..413ad2444ba2 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7800,8 +7800,6 @@ static void bnxt_remove_one(struct pci_dev *pdev)
bnxt_dcb_free(bp);
kfree(bp->edev);
bp->edev = NULL;
-   if (bp->xdp_prog)
-   bpf_prog_put(bp->xdp_prog);
bnxt_cleanup_pci(bp);
free_netdev(dev);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d2b057a3e512..0f5c012de52e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4308,9 +4308,6 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 {
mlx5e_ipsec_cleanup(priv);
mlx5e_vxlan_cleanup(priv);
-
-   if (priv->channels.params.xdp_prog)
-   bpf_prog_put(priv->channels.params.xdp_prog);
 }
 
 static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c 
b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index e379b78e86ef..54bfd7846f6d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -82,12 +82,6 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, 
struct nfp_net *nn)
return nfp_net_ebpf_capable(nn) ? "BPF" : "";
 }
 
-static void nfp_bpf_vnic_free(struct nfp_app *app, struct nfp_net *nn)
-{
-   if (nn->dp.bpf_offload_xdp)
-   nfp_bpf_xdp_offload(app, nn, NULL);
-}
-
 static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 void *type_data, void *cb_priv)
 {
@@ -168,7 +162,6 @@ const struct nfp_app_type app_bpf = {
.extra_cap  = nfp_bpf_extra_cap,
 
.vnic_alloc = nfp_app_nic_vnic_alloc,
-   .vnic_free  = nfp_bpf_vnic_free,
 
.setup_tc   = nfp_bpf_setup_tc,
.tc_busy= nfp_bpf_tc_busy,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index ea6bbf1efefc..ad3e9f6a61e5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3562,9 +3562,6 @@ struct nfp_net *nfp_net_alloc(struct pci_dev *pdev, bool 
needs_netdev,
  */
 void nfp_net_free(struct nfp_net *nn)
 {
-   if (nn->xdp_prog)
-   bpf_prog_put(nn->xdp_prog);
-
if (nn->dp.netdev)
free_netdev(nn->dp.netdev);
else
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8f9b3eb82137..57332b3e5e64 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1068,10 +1068,6 @@ static void __qede_remove(struct pci_dev *pdev, enum 
qede_remove_mode mode)
 
pci_set_drvdata(pdev, NULL);
 
-   /* Release edev's reference to XDP's bpf if such exist */
-   if (edev->xdp_prog)
-   bpf_prog_put(edev->xdp_prog);
-
/* Use global ops since we've freed edev */
qed_ops->common->slowpath_stop(cdev);
if (system_state == SYSTEM_POWER_OFF)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6a7bde9bc4b2..6f7e8e45c961 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -673,7 +673,6 @@ static void tun_detach(struct tun_file *tfile, bool clean)
 static void tun_detach_all(struct net_device *dev)
 {
struct tun_struct *tun = 

[PATCH net-next v2 4/8] netdevsim: add software driver for testing offloads

2017-11-30 Thread Jakub Kicinski
To be able to run selftests without any hardware required we
need a software model.  The model can also serve as an example
implementation for those implementing actual HW offloads.
The dummy driver have previously been extended to test SR-IOV,
but the general consensus seems to be against adding further
features to it.

Add a new driver for purposes of software modelling only.
eBPF and SR-IOV will be added here shortly, others are invited
to further extend the driver with their offload models.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
Reviewed-by: Quentin Monnet 
---
 MAINTAINERS   |   5 ++
 drivers/net/Kconfig   |  11 
 drivers/net/Makefile  |   1 +
 drivers/net/netdevsim/Makefile|   6 ++
 drivers/net/netdevsim/netdev.c| 118 ++
 drivers/net/netdevsim/netdevsim.h |  26 +
 6 files changed, 167 insertions(+)
 create mode 100644 drivers/net/netdevsim/Makefile
 create mode 100644 drivers/net/netdevsim/netdev.c
 create mode 100644 drivers/net/netdevsim/netdevsim.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 77d819b458a9..010e46a38373 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9599,6 +9599,11 @@ NETWORKING [WIRELESS]
 L: linux-wirel...@vger.kernel.org
 Q: http://patchwork.kernel.org/project/linux-wireless/list/
 
+NETDEVSIM
+M: Jakub Kicinski 
+S: Maintained
+F: drivers/net/netdevsim/*
+
 NETXEN (1/10) GbE SUPPORT
 M: Manish Chopra 
 M: Rahul Verma 
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0936da592e12..944ec3c9282c 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -497,4 +497,15 @@ config THUNDERBOLT_NET
 
 source "drivers/net/hyperv/Kconfig"
 
+config NETDEVSIM
+   tristate "Simulated networking device"
+   depends on DEBUG_FS
+   help
+ This driver is a developer testing tool and software model that can
+ be used to test various control path networking APIs, especially
+ HW-offload related.
+
+ To compile this driver as a module, choose M here: the module
+ will be called netdevsim.
+
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 766f62d02a0b..04c3b747812c 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_FUJITSU_ES) += fjes/
 
 thunderbolt-net-y += thunderbolt.o
 obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o
+obj-$(CONFIG_NETDEVSIM) += netdevsim/
diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
new file mode 100644
index ..07867bfe873b
--- /dev/null
+++ b/drivers/net/netdevsim/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_NETDEVSIM) += netdevsim.o
+
+netdevsim-objs := \
+   netdev.o \
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
new file mode 100644
index ..7599c72c477a
--- /dev/null
+++ b/drivers/net/netdevsim/netdev.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree.
+ *
+ * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+ * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "netdevsim.h"
+
+static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+   struct netdevsim *ns = netdev_priv(dev);
+
+   u64_stats_update_begin(&ns->syncp);
+   ns->tx_packets++;
+   ns->tx_bytes += skb->len;
+   u64_stats_update_end(&ns->syncp);
+
+   dev_kfree_skb(skb);
+
+   return NETDEV_TX_OK;
+}
+
+static void nsim_set_rx_mode(struct net_device *dev)
+{
+}
+
+static void
+nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+   struct netdevsim *ns = netdev_priv(dev);
+   unsigned int start;
+
+   do {
+   start = u64_stats_fetch_begin(&ns->syncp);
+   stats->tx_bytes = ns->tx_bytes;
+   stats->tx_packets = ns->tx_packets;
+   } while (u64_stats_fetch_retry(&ns->syncp, start));
+}
+
+static const struct net_device_ops nsim_netdev_ops = {
+   .ndo_start_xmit = nsim_start_xmit,
+   .ndo_set_rx_mode= nsim_set_rx_mode,
+   .ndo_set_mac_address= eth_mac_addr,
+   

[PATCH net-next v2 5/8] netdevsim: add bpf offload support

2017-11-30 Thread Jakub Kicinski
Add support for loading programs for netdevsim devices and
expose the related information via DebugFS.  Both offload
of XDP and cls_bpf programs is supported.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
Reviewed-by: Quentin Monnet 
---
 drivers/net/netdevsim/Makefile|   1 +
 drivers/net/netdevsim/bpf.c   | 373 ++
 drivers/net/netdevsim/netdev.c| 116 +++-
 drivers/net/netdevsim/netdevsim.h |  40 
 4 files changed, 529 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/netdevsim/bpf.c

diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
index 07867bfe873b..074ddebbc41d 100644
--- a/drivers/net/netdevsim/Makefile
+++ b/drivers/net/netdevsim/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_NETDEVSIM) += netdevsim.o
 
 netdevsim-objs := \
netdev.o \
+   bpf.o \
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
new file mode 100644
index ..8e4398a50903
--- /dev/null
+++ b/drivers/net/netdevsim/bpf.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree.
+ *
+ * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+ * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "netdevsim.h"
+
+struct nsim_bpf_bound_prog {
+   struct netdevsim *ns;
+   struct bpf_prog *prog;
+   struct dentry *ddir;
+   const char *state;
+   bool is_loaded;
+   struct list_head l;
+};
+
+static int nsim_debugfs_bpf_string_read(struct seq_file *file, void *data)
+{
+   const char **str = file->private;
+
+   if (*str)
+   seq_printf(file, "%s\n", *str);
+
+   return 0;
+}
+
+static int nsim_debugfs_bpf_string_open(struct inode *inode, struct file *f)
+{
+   return single_open(f, nsim_debugfs_bpf_string_read, inode->i_private);
+}
+
+static const struct file_operations nsim_bpf_string_fops = {
+   .owner = THIS_MODULE,
+   .open = nsim_debugfs_bpf_string_open,
+   .release = single_release,
+   .read = seq_read,
+   .llseek = seq_lseek
+};
+
+static int
+nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn)
+{
+   struct nsim_bpf_bound_prog *state;
+
+   state = env->prog->aux->offload->dev_priv;
+   if (state->ns->bpf_bind_verifier_delay && !insn_idx)
+   msleep(state->ns->bpf_bind_verifier_delay);
+
+   return 0;
+}
+
+static const struct bpf_ext_analyzer_ops nsim_bpf_analyzer_ops = {
+   .insn_hook = nsim_bpf_verify_insn,
+};
+
+static bool nsim_xdp_offload_active(struct netdevsim *ns)
+{
+   return ns->xdp_prog_mode == XDP_ATTACHED_HW;
+}
+
+static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded)
+{
+   struct nsim_bpf_bound_prog *state;
+
+   if (!prog || !prog->aux->offload)
+   return;
+
+   state = prog->aux->offload->dev_priv;
+   state->is_loaded = loaded;
+}
+
+static int
+nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog)
+{
+   nsim_prog_set_loaded(ns->bpf_offloaded, false);
+
+   WARN(!!ns->bpf_offloaded != oldprog,
+"bad offload state, expected offload %sto be active",
+oldprog ? "" : "not ");
+   ns->bpf_offloaded = prog;
+   ns->bpf_offloaded_id = prog ? prog->aux->id : 0;
+   nsim_prog_set_loaded(prog, true);
+
+   return 0;
+}
+
+int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type,
+  void *type_data, void *cb_priv)
+{
+   struct tc_cls_bpf_offload *cls_bpf = type_data;
+   struct bpf_prog *prog = cls_bpf->prog;
+   struct netdevsim *ns = cb_priv;
+   bool skip_sw;
+
+   if (type != TC_SETUP_CLSBPF ||
+   !tc_can_offload(ns->netdev) ||
+   cls_bpf->common.protocol != htons(ETH_P_ALL) ||
+   cls_bpf->common.chain_index)
+   return -EOPNOTSUPP;
+
+   skip_sw = cls_bpf->gen_flags & TCA_CLS_FLAGS_SKIP_SW;
+
+   if (nsim_xdp_offload_active(ns))
+   return -EBUSY;
+
+   if (!ns->bpf_tc_accept)
+   return -EOPNOTSUPP;
+   /* Note: progs without skip_sw will probably not be dev bound */
+   if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept)
+   return -EOPNOTSUPP;
+
+   switch (cls_bpf->command) {
+   

Re: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

2017-11-30 Thread Al Viro
On Thu, Nov 30, 2017 at 04:57:30PM -0800, Kees Cook wrote:
> On Mon, Oct 9, 2017 at 4:10 PM, David Miller  wrote:
> > Shmulik Ladkani (1):
> >   netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
> 
> This adds a new user of set_fs(), which we're trying to eliminate (or
> at least not expand):
> 
> +   set_fs(KERNEL_DS);
> +   fd = bpf_obj_get_user(path);
> +   set_fs(oldfs);
> 
> Can you please adjust this to not make set_fs() changes?

That's not the worst problem there.  Messing with descriptor table is much
worse.  It can be shared between threads; by the time you get to fdget()
the damn thing might have nothing to do with what bpf_obj_get_user() has
put there, ditto for sys_close().

Use of file descriptors should be limited to "got a number from userland,
convert to struct file *" on the way in and "install struct file * into
descriptor table and return the descriptor to userland" on the way out.
And the latter - *ONLY* after the last possible point of failure.  Once
a file reference is inserted into descriptor table, that's it - you
can't undo that.

The only way to use bpf_obj_get_user() is to pass its return value to
userland.  As return value of syscall - not even put_user() (for that
you'd need to reserve the descriptor, copy it to userland and only
then attach struct file * to it).

The whole approach stinks - what it needs is something that would
take struct filename * and return struct bpf_prog * or struct file *
reference.  With bpf_obj_get_user() and this thing implemented
via that.

I'm looking into that thing...


Re: [PATCH net-next 2/5] rhashtable: Add rhastable_walk_peek

2017-11-30 Thread Herbert Xu
On Thu, Nov 30, 2017 at 05:15:16PM -0800, Tom Herbert wrote:
>
> We don't need a guarantee of stability, but what I am seeing is that
> we're consistently dropping entries when doing a multi-part
> netlink walk. We start iterating over the table filling in the netlink
> info. But eventually the netlink info fills up and returns an error.
> netlink dump gets called again but now the iter of the table returns
> the object following the one that would have overflowed the netlink
> buffer. So the result I was seeing is that we dropped one object in
> each pass.

Thanks Tom! This information is very useful.

It sounds like this problem isn't specific to ila and would exist
for all rhashtable users that dump through netlink.  Let me think
about this a little bit more.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH net-next 2/5] rhashtable: Add rhastable_walk_peek

2017-11-30 Thread Tom Herbert
On Thu, Nov 30, 2017 at 4:38 PM, Herbert Xu  wrote:
> On Thu, Nov 30, 2017 at 04:03:02PM -0800, Tom Herbert wrote:
>> This function is like rhashtable_walk_next except that it only returns
>> the current element in the iter and does not advance the iter.
>>
>> This patch also creates __rhashtable_walk_find_next. It finds the next
>> element in the table when the entry cached in iter is NULL or at the end
>> of a slot. __rhashtable_walk_find_next is called from
>> rhashtable_walk_next and rhastable_walk_peek.
>>
>> Signed-off-by: Tom Herbert 
>
> Hi Tom:
>
> Could you add some motivation for this feature into the patch
> description? As it is it's difficult to deduce why we would want
> to add something like this given that hashtable walks are always
> unstable and there is no guarantee that two calls to peek or a
> peek followed by a normal walk will see the same entry.
>
Hi Herbert,

We don't need a guarantee of stability, but what I am seeing is that
we're consistently dropping entries when doing a multi-part
netlink walk. We start iterating over the table filling in the netlink
info. But eventually the netlink info fills up and returns an error.
netlink dump gets called again but now the iter of the table returns
the object following the one that would have overflowed the netlink
buffer. So the result I was seeing is that we dropped one object in
each pass.

This fixes the nldump for ila which will be in a follow on patch set.
In pseudo code it looks something like this:

rhashtable_walk_start(rhiter);

/* Get first entry */
ila = rhashtable_walk_peek(rhiter);

while (ila) {
 if (ila_dump_info(ila) < 0)
 break;
 ila = rhashtable_walk_next(rhiter);
}

rhashtable_walk_stop(rhiter);

return;

So peek is only called once and we only advance iter once the current
entry is successfully processed.

Tom



> Thanks,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH net-next v4 2/2] net: ethernet: socionext: add AVE ethernet driver

2017-11-30 Thread Kunihiko Hayashi
The UniPhier platform from Socionext provides the AVE ethernet
controller that includes MAC and MDIO bus supporting RGMII/RMII
modes. The controller is named AVE.

Signed-off-by: Kunihiko Hayashi 
Signed-off-by: Jassi Brar 
---
 drivers/net/ethernet/Kconfig |1 +
 drivers/net/ethernet/Makefile|1 +
 drivers/net/ethernet/socionext/Kconfig   |   22 +
 drivers/net/ethernet/socionext/Makefile  |5 +
 drivers/net/ethernet/socionext/sni_ave.c | 1744 ++
 5 files changed, 1773 insertions(+)
 create mode 100644 drivers/net/ethernet/socionext/Kconfig
 create mode 100644 drivers/net/ethernet/socionext/Makefile
 create mode 100644 drivers/net/ethernet/socionext/sni_ave.c

diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index c604213..d50519e 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -170,6 +170,7 @@ source "drivers/net/ethernet/sis/Kconfig"
 source "drivers/net/ethernet/sfc/Kconfig"
 source "drivers/net/ethernet/sgi/Kconfig"
 source "drivers/net/ethernet/smsc/Kconfig"
+source "drivers/net/ethernet/socionext/Kconfig"
 source "drivers/net/ethernet/stmicro/Kconfig"
 source "drivers/net/ethernet/sun/Kconfig"
 source "drivers/net/ethernet/tehuti/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 39f62733..6cf5ade 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_SFC) += sfc/
 obj-$(CONFIG_SFC_FALCON) += sfc/falcon/
 obj-$(CONFIG_NET_VENDOR_SGI) += sgi/
 obj-$(CONFIG_NET_VENDOR_SMSC) += smsc/
+obj-$(CONFIG_NET_VENDOR_SOCIONEXT) += socionext/
 obj-$(CONFIG_NET_VENDOR_STMICRO) += stmicro/
 obj-$(CONFIG_NET_VENDOR_SUN) += sun/
 obj-$(CONFIG_NET_VENDOR_TEHUTI) += tehuti/
diff --git a/drivers/net/ethernet/socionext/Kconfig 
b/drivers/net/ethernet/socionext/Kconfig
new file mode 100644
index 000..3a1829e
--- /dev/null
+++ b/drivers/net/ethernet/socionext/Kconfig
@@ -0,0 +1,22 @@
+config NET_VENDOR_SOCIONEXT
+   bool "Socionext ethernet drivers"
+   default y
+   ---help---
+ Option to select ethernet drivers for Socionext platforms.
+
+ Note that the answer to this question doesn't directly affect the
+ kernel: saying N will just cause the configurator to skip all
+ the questions about Socionext devices. If you say Y, you will be asked
+ for your specific card in the following questions.
+
+if NET_VENDOR_SOCIONEXT
+
+config SNI_AVE
+   tristate "Socionext AVE ethernet support"
+   depends on (ARCH_UNIPHIER || COMPILE_TEST) && OF
+   select PHYLIB
+   ---help---
+ Driver for gigabit ethernet MACs, called AVE, in the
+ Socionext UniPhier family.
+
+endif #NET_VENDOR_SOCIONEXT
diff --git a/drivers/net/ethernet/socionext/Makefile 
b/drivers/net/ethernet/socionext/Makefile
new file mode 100644
index 000..ab83df6
--- /dev/null
+++ b/drivers/net/ethernet/socionext/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for all ethernet ip drivers on Socionext platforms
+#
+obj-$(CONFIG_SNI_AVE) += sni_ave.o
diff --git a/drivers/net/ethernet/socionext/sni_ave.c 
b/drivers/net/ethernet/socionext/sni_ave.c
new file mode 100644
index 000..6f42f12
--- /dev/null
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -0,0 +1,1744 @@
+/**
+ * sni_ave.c - Socionext UniPhier AVE ethernet driver
+ *
+ * Copyright 2014 Panasonic Corporation
+ * Copyright 2015-2017 Socionext Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2  of
+ * the License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* General Register Group */
+#define AVE_IDR0x000   /* ID */
+#define AVE_VR 0x004   /* Version */
+#define AVE_GRR0x008   /* Global Reset */
+#define AVE_CFGR   0x00c   /* Configuration */
+
+/* Interrupt Register Group */
+#define AVE_GIMR   0x100   /* Global Interrupt Mask */
+#define AVE_GISR   0x104   /* Global Interrupt Status */
+
+/* MAC Register Group */
+#define AVE_TXCR   0x200   /* TX Setup */
+#define AVE_RXCR   0x204   /* RX Setup */
+#define AVE_RXMAC1R0x208   /* MAC address (lower) */
+#define AVE_RXMAC2R0x20c   /* MAC address (upper) */
+#define AVE_MDIOCTR0x214   /* MDIO 

[PATCH net-next v4 1/2] dt-bindings: net: add DT bindings for Socionext UniPhier AVE

2017-11-30 Thread Kunihiko Hayashi
DT bindings for the AVE ethernet controller found on Socionext's
UniPhier platforms.

Signed-off-by: Kunihiko Hayashi 
Signed-off-by: Jassi Brar 
Acked-by: Rob Herring 
---
 .../bindings/net/socionext,uniphier-ave4.txt   | 48 ++
 1 file changed, 48 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt

diff --git a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt 
b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
new file mode 100644
index 000..4700377
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
@@ -0,0 +1,48 @@
+* Socionext AVE ethernet controller
+
+This describes the devicetree bindings for AVE ethernet controller
+implemented on Socionext UniPhier SoCs.
+
+Required properties:
+ - compatible: Should be
+   - "socionext,uniphier-pro4-ave4" : for Pro4 SoC
+   - "socionext,uniphier-pxs2-ave4" : for PXs2 SoC
+   - "socionext,uniphier-ld11-ave4" : for LD11 SoC
+   - "socionext,uniphier-ld20-ave4" : for LD20 SoC
+ - reg: Address where registers are mapped and size of region.
+ - interrupts: Should contain the MAC interrupt.
+ - phy-mode: See ethernet.txt in the same directory. Allow to choose
+   "rgmii", "rmii", or "mii" according to the PHY.
+ - phy-handle: Should point to the external phy device.
+   See ethernet.txt file in the same directory.
+ - clocks: A phandle to the clock for the MAC.
+
+Optional properties:
+ - resets: A phandle to the reset control for the MAC
+ - local-mac-address: See ethernet.txt in the same directory.
+
+Required subnode:
+ - mdio: Device tree subnode with the following required properties:
+   - #address-cells: Must be <1>.
+   - #size-cells: Must be <0>.
+   - reg: phy ID number, usually a small integer.
+
+Example:
+
+   ether: ethernet@6500 {
+   compatible = "socionext,uniphier-ld20-ave4";
+   reg = <0x6500 0x8500>;
+   interrupts = <0 66 4>;
+   phy-mode = "rgmii";
+   phy-handle = <&ethphy>;
+   clocks = <&sys_clk 6>;
+   resets = <&sys_rst 6>;
+   local-mac-address = [00 00 00 00 00 00];
+   mdio {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   ethphy: ethphy@1 {
+   reg = <1>;
+   };
+   };
+   };
-- 
2.7.4



[PATCH net-next v4 0/2] add UniPhier AVE ethernet support

2017-11-30 Thread Kunihiko Hayashi
This series adds support for Socionext AVE ethernet controller implemented
on UniPhier SoCs. This driver supports RGMII/RMII modes.

v3: http://www.spinics.net/lists/netdev/msg462550.html

The PHY patch included in v1 has already separated in:
http://www.spinics.net/lists/netdev/msg454595.html

Changes since v3:
- remove checking dma address and use dma_set_mask() to restrict address
- replace ave_mdio_busywait() with read_poll_timeout()
- replace functions to access to registers with readl/writel() directly
- replace a function to access to macaddr with ave_hw_write_macaddr()
- change return value of ave_dma_map() to error value
- move mdiobus_unregister() from ave_remove() to ave_uninit()
- eliminate else block at the end of ave_dma_map()
- add mask definitions for packet filter
- sort bitmap definitions in descending order
- add error check to some functions
- rename and sort functions to clear sub-categories
- fix error value consistency
- remove unneeded initializers
- change type of constant arrays

Changes since v2:
- replace clk_get() with devm_clk_get()
- replace reset_control_get() with devm_reset_control_get_optional_shared()
- add error return when the error occurs on the above *_get functions
- sort soc data and compatible strings
- remove clearly obvious comments
- modify dt-bindings document consistent with these modifications

Changes since v1:
- add/remove devicetree properties and sub-node
  - remove "internal-phy-interrupt" and "desc-bits" property
  - add SoC data structures based on compatible strings
  - add node operation to apply "mdio" sub-node
- add support for features
  - add support for {get,set}_pauseparam and pause frame operations
  - add support for ndo_get_stats64 instead of ndo_get_stats
- replace with desiable functions
  - replace check for valid phy_mode with phy_interface{_mode}_is_rgmii()
  - replace phy attach message with phy_attached_info()
  - replace 32bit operation with {upper,lower}_32_bits() on ave_wdesc_addr()
  - replace nway_reset and get_link with generic functions
- move operations to proper functions
  - move phy_start_aneg() to ndo_open,
and remove unnecessary PHY interrupt operations
See http://www.spinics.net/lists/netdev/msg454590.html
  - move irq initialization and descriptor memory allocation to ndo_open
  - move initialization of reset and clock and mdiobus to ndo_init
- fix skbuffer operations
  - fix skb alignment operations and add Rx buffer adjustment for descriptor
See http://www.spinics.net/lists/netdev/msg456014.html
  - add error returns when dma_map_single() failed 
- clean up code structures
  - clean up wait-loop and wake-queue conditions
  - add ave_wdesc_addr() and offset definitions
  - add ave_macaddr_init() to clean up mac-address operation
  - fix checking whether Tx entry is not enough
  - fix supported features of phydev
  - add necessary free/disable operations
  - add phydev check on ave_{get,set}_wol()
  - remove netif_carrier functions, phydev initializer, and Tx budget check
- change obsolate codes
  - replace ndev->{base_addr,irq} with the members of ave_private
- rename goto labels and mask definitions, and remove unused codes

Kunihiko Hayashi (2):
  dt-bindings: net: add DT bindings for Socionext UniPhier AVE
  net: ethernet: socionext: add AVE ethernet driver

 .../bindings/net/socionext,uniphier-ave4.txt   |   48 +
 drivers/net/ethernet/Kconfig   |1 +
 drivers/net/ethernet/Makefile  |1 +
 drivers/net/ethernet/socionext/Kconfig |   22 +
 drivers/net/ethernet/socionext/Makefile|5 +
 drivers/net/ethernet/socionext/sni_ave.c   | 1744 
 6 files changed, 1821 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
 create mode 100644 drivers/net/ethernet/socionext/Kconfig
 create mode 100644 drivers/net/ethernet/socionext/Makefile
 create mode 100644 drivers/net/ethernet/socionext/sni_ave.c

-- 
2.7.4



Re: [Patch net-next] act_mirred: get rid of mirred_list_lock spinlock

2017-11-30 Thread Cong Wang
On Thu, Nov 30, 2017 at 3:12 PM, Eric Dumazet  wrote:
> On Thu, 2017-11-30 at 14:53 -0800, Cong Wang wrote:
>> @@ -55,13 +54,10 @@ static void tcf_mirred_release(struct tc_action
>> *a, int bind)
>>   struct tcf_mirred *m = to_mirred(a);
>>   struct net_device *dev;
>>
>> - /* We could be called either in a RCU callback or with RTNL
>> lock held. */
>> - spin_lock_bh(&mirred_list_lock);
>>   list_del(&m->tcfm_list);
>>   dev = rcu_dereference_protected(m->tcfm_dev, 1);
>
> If RTNL is held at this point, I suggest to use
> rtnl_dereference() instead of rcu_dereference_protected() to get proper
> lockdep coverage.

Ah, sure, I missed it. Will send v2 later.


Re: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

2017-11-30 Thread Kees Cook
On Mon, Oct 9, 2017 at 4:10 PM, David Miller  wrote:
> Shmulik Ladkani (1):
>   netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

This adds a new user of set_fs(), which we're trying to eliminate (or
at least not expand):

+   set_fs(KERNEL_DS);
+   fd = bpf_obj_get_user(path);
+   set_fs(oldfs);

Can you please adjust this to not make set_fs() changes?

Thanks!

-Kees

-- 
Kees Cook
Pixel Security


[PATCH] netfilter: add overflow checks in xt_bpf.c

2017-11-30 Thread Jann Horn
Check whether inputs from userspace are too long (explicit length field too
big or string not null-terminated) to avoid out-of-bounds reads.

As far as I can tell, this can at worst lead to very limited kernel heap
memory disclosure or oopses.

This bug can be triggered by an unprivileged user even if the xt_bpf module
is not loaded: iptables is available in network namespaces, and the xt_bpf
module can be autoloaded.

Triggering the bug with a classic BPF filter with fake length 0x1000 causes
the following KASAN report:

==
BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0
Read of size 32768 at addr 8801eff2c494 by task test/4627

CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1
[...]
Call Trace:
 dump_stack+0x5c/0x85
 print_address_description+0x6a/0x260
 kasan_report+0x254/0x370
 ? bpf_prog_create+0x84/0xf0
 memcpy+0x1f/0x50
 bpf_prog_create+0x84/0xf0
 bpf_mt_check+0x90/0xd6 [xt_bpf]
[...]
Allocated by task 4627:
 kasan_kmalloc+0xa0/0xd0
 __kmalloc_node+0x47/0x60
 xt_alloc_table_info+0x41/0x70 [x_tables]
[...]
The buggy address belongs to the object at 8801eff2c3c0
which belongs to the cache kmalloc-2048 of size 2048
The buggy address is located 212 bytes inside of
2048-byte region [8801eff2c3c0, 8801eff2cbc0)
[...]
==

Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match")
Signed-off-by: Jann Horn 
---
 net/netfilter/xt_bpf.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 041da0d9c06f..1f7fbd3c7e5a 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, 
__u16 len,
 {
struct sock_fprog_kern program;
 
+   if (len > XT_BPF_MAX_NUM_INSTR)
+   return -EINVAL;
+
program.len = len;
program.filter = insns;
 
@@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct 
bpf_prog **ret)
mm_segment_t oldfs = get_fs();
int retval, fd;
 
+   if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX)
+   return -EINVAL;
+
set_fs(KERNEL_DS);
fd = bpf_obj_get_user(path, 0);
set_fs(oldfs);
-- 
2.15.0.531.g2ccb3012c9-goog



Re: [PATCH net-next 2/5] rhashtable: Add rhastable_walk_peek

2017-11-30 Thread Herbert Xu
On Thu, Nov 30, 2017 at 04:03:02PM -0800, Tom Herbert wrote:
> This function is like rhashtable_walk_next except that it only returns
> the current element in the iter and does not advance the iter.
> 
> This patch also creates __rhashtable_walk_find_next. It finds the next
> element in the table when the entry cached in iter is NULL or at the end
> of a slot. __rhashtable_walk_find_next is called from
> rhashtable_walk_next and rhastable_walk_peek.
> 
> Signed-off-by: Tom Herbert 

Hi Tom:

Could you add some motivation for this feature into the patch
description? As it is it's difficult to deduce why we would want
to add something like this given that hashtable walks are always
unstable and there is no guarantee that two calls to peek or a
peek followed by a normal walk will see the same entry.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH net-next 00/11] net: ethernet: ti: cpsw/ale clean up and optimization

2017-11-30 Thread Grygorii Strashko
This is set of non critical clean ups and optimizations for TI
CPSW and ALE drivers.

Rebased on top on net-next.

Grygorii Strashko (11):
  net: ethernet: ti: cpsw: drop unused var poll from
cpsw_update_channels_res
  net: ethernet: ti: cpsw: use proper io apis
  net: ethernet: ti: cpsw: move platform data struct to .c file
  net: ethernet: ti: cpsw: move mac_hi/lo defines in cpsw.h
  net: ethernet: ti: cpsw: fix ale port numbers
  net: ethernet: ti: ale: use proper io apis
  net: ethernet: ti: ale: disable ale from stop()
  net: ethernet: ti: ale: optimize ale entry mask bits configuartion
  net: ethernet: ti: ale: move static initialization in
cpsw_ale_create()
  net: ethernet: ti: ale: use devm_kzalloc in cpsw_ale_create()
  net: ethernet: ti: ale: fix port check in cpsw_ale_control_set/get

 drivers/net/ethernet/ti/cpsw.c|  84 +++---
 drivers/net/ethernet/ti/cpsw.h|  23 +--
 drivers/net/ethernet/ti/cpsw_ale.c| 109 ++
 drivers/net/ethernet/ti/cpsw_ale.h|   1 -
 drivers/net/ethernet/ti/netcp_ethss.c |   6 +-
 5 files changed, 98 insertions(+), 125 deletions(-)

-- 
2.10.5



[PATCH net-next 04/11] net: ethernet: ti: cpsw: move mac_hi/lo defines in cpsw.h

2017-11-30 Thread Grygorii Strashko
Move mac_hi/lo defines in common header cpsw.h and re-use
them for netcp_ethss.c.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c| 4 
 drivers/net/ethernet/ti/cpsw.h| 4 
 drivers/net/ethernet/ti/netcp_ethss.c | 5 +
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 54c8758..2bf0bda 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -978,10 +978,6 @@ static inline void soft_reset(const char *module, void 
__iomem *reg)
WARN(readl_relaxed(reg) & 1, "failed to soft-reset %s\n", module);
 }
 
-#define mac_hi(mac)(((mac)[0] << 0) | ((mac)[1] << 8) |\
-((mac)[2] << 16) | ((mac)[3] << 24))
-#define mac_lo(mac)(((mac)[4] << 0) | ((mac)[5] << 8))
-
 static void cpsw_set_slave_mac(struct cpsw_slave *slave,
   struct cpsw_priv *priv)
 {
diff --git a/drivers/net/ethernet/ti/cpsw.h b/drivers/net/ethernet/ti/cpsw.h
index a325f555..cf111db 100644
--- a/drivers/net/ethernet/ti/cpsw.h
+++ b/drivers/net/ethernet/ti/cpsw.h
@@ -17,6 +17,10 @@
 #include 
 #include 
 
+#define mac_hi(mac)(((mac)[0] << 0) | ((mac)[1] << 8) |\
+((mac)[2] << 16) | ((mac)[3] << 24))
+#define mac_lo(mac)(((mac)[4] << 0) | ((mac)[5] << 8))
+
 void cpsw_phy_sel(struct device *dev, phy_interface_t phy_mode, int slave);
 int ti_cm_get_macid(struct device *dev, int slave, u8 *mac_addr);
 
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c 
b/drivers/net/ethernet/ti/netcp_ethss.c
index e831c49..12765e4 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 
+#include "cpsw.h"
 #include "cpsw_ale.h"
 #include "netcp.h"
 #include "cpts.h"
@@ -2047,10 +2048,6 @@ static const struct ethtool_ops keystone_ethtool_ops = {
.get_ts_info= keystone_get_ts_info,
 };
 
-#define mac_hi(mac)(((mac)[0] << 0) | ((mac)[1] << 8) |\
-((mac)[2] << 16) | ((mac)[3] << 24))
-#define mac_lo(mac)(((mac)[4] << 0) | ((mac)[5] << 8))
-
 static void gbe_set_slave_mac(struct gbe_slave *slave,
  struct gbe_intf *gbe_intf)
 {
-- 
2.10.5



[PATCH net-next 03/11] net: ethernet: ti: cpsw: move platform data struct to .c file

2017-11-30 Thread Grygorii Strashko
CPSW platform data struct cpsw_platform_data and struct cpsw_slave_data are
used only inside the cpsw.c module, so move these definitions there.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 21 +
 drivers/net/ethernet/ti/cpsw.h | 21 -
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index f74a8fd..54c8758 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -352,6 +352,27 @@ struct cpsw_hw_stats {
u32 rxdmaoverruns;
 };
 
+struct cpsw_slave_data {
+   struct device_node *phy_node;
+   charphy_id[MII_BUS_ID_SIZE];
+   int phy_if;
+   u8  mac_addr[ETH_ALEN];
+   u16 dual_emac_res_vlan; /* Reserved VLAN for DualEMAC */
+};
+
+struct cpsw_platform_data {
+   struct cpsw_slave_data  *slave_data;
+   u32 ss_reg_ofs; /* Subsystem control register offset */
+   u32 channels;   /* number of cpdma channels (symmetric) */
+   u32 slaves; /* number of slave cpgmac ports */
+   u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */
+   u32 ale_entries;/* ale table size */
+   u32 bd_ram_size;  /*buffer descriptor ram size */
+   u32 mac_control;/* Mac control register */
+   u16 default_vlan;   /* Def VLAN for ALE lookup in VLAN aware mode*/
+   booldual_emac;  /* Enable Dual EMAC mode */
+};
+
 struct cpsw_slave {
void __iomem*regs;
struct cpsw_sliver_regs __iomem *sliver;
diff --git a/drivers/net/ethernet/ti/cpsw.h b/drivers/net/ethernet/ti/cpsw.h
index 6c3037a..a325f555 100644
--- a/drivers/net/ethernet/ti/cpsw.h
+++ b/drivers/net/ethernet/ti/cpsw.h
@@ -17,27 +17,6 @@
 #include 
 #include 
 
-struct cpsw_slave_data {
-   struct device_node *phy_node;
-   charphy_id[MII_BUS_ID_SIZE];
-   int phy_if;
-   u8  mac_addr[ETH_ALEN];
-   u16 dual_emac_res_vlan; /* Reserved VLAN for DualEMAC */
-};
-
-struct cpsw_platform_data {
-   struct cpsw_slave_data  *slave_data;
-   u32 ss_reg_ofs; /* Subsystem control register offset */
-   u32 channels;   /* number of cpdma channels (symmetric) */
-   u32 slaves; /* number of slave cpgmac ports */
-   u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */
-   u32 ale_entries;/* ale table size */
-   u32 bd_ram_size;  /*buffer descriptor ram size */
-   u32 mac_control;/* Mac control register */
-   u16 default_vlan;   /* Def VLAN for ALE lookup in VLAN aware mode*/
-   booldual_emac;  /* Enable Dual EMAC mode */
-};
-
 void cpsw_phy_sel(struct device *dev, phy_interface_t phy_mode, int slave);
 int ti_cm_get_macid(struct device *dev, int slave, u8 *mac_addr);
 
-- 
2.10.5



[PATCH net-next 02/11] net: ethernet: ti: cpsw: use proper io apis

2017-11-30 Thread Grygorii Strashko
Switch to use writel_relaxed/readl_relaxed() IO API instead of raw version
as it is recommended.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 523c110..f74a8fd 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -365,12 +365,12 @@ struct cpsw_slave {
 
 static inline u32 slave_read(struct cpsw_slave *slave, u32 offset)
 {
-   return __raw_readl(slave->regs + offset);
+   return readl_relaxed(slave->regs + offset);
 }
 
 static inline void slave_write(struct cpsw_slave *slave, u32 val, u32 offset)
 {
-   __raw_writel(val, slave->regs + offset);
+   writel_relaxed(val, slave->regs + offset);
 }
 
 struct cpsw_vector {
@@ -660,8 +660,8 @@ static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
 
 static void cpsw_intr_enable(struct cpsw_common *cpsw)
 {
-   __raw_writel(0xFF, >wr_regs->tx_en);
-   __raw_writel(0xFF, >wr_regs->rx_en);
+   writel_relaxed(0xFF, >wr_regs->tx_en);
+   writel_relaxed(0xFF, >wr_regs->rx_en);
 
cpdma_ctlr_int_ctrl(cpsw->dma, true);
return;
@@ -669,8 +669,8 @@ static void cpsw_intr_enable(struct cpsw_common *cpsw)
 
 static void cpsw_intr_disable(struct cpsw_common *cpsw)
 {
-   __raw_writel(0, >wr_regs->tx_en);
-   __raw_writel(0, >wr_regs->rx_en);
+   writel_relaxed(0, >wr_regs->tx_en);
+   writel_relaxed(0, >wr_regs->rx_en);
 
cpdma_ctlr_int_ctrl(cpsw->dma, false);
return;
@@ -949,12 +949,12 @@ static inline void soft_reset(const char *module, void 
__iomem *reg)
 {
unsigned long timeout = jiffies + HZ;
 
-   __raw_writel(1, reg);
+   writel_relaxed(1, reg);
do {
cpu_relax();
-   } while ((__raw_readl(reg) & 1) && time_after(timeout, jiffies));
+   } while ((readl_relaxed(reg) & 1) && time_after(timeout, jiffies));
 
-   WARN(__raw_readl(reg) & 1, "failed to soft-reset %s\n", module);
+   WARN(readl_relaxed(reg) & 1, "failed to soft-reset %s\n", module);
 }
 
 #define mac_hi(mac)(((mac)[0] << 0) | ((mac)[1] << 8) |\
@@ -1015,7 +1015,7 @@ static void _cpsw_adjust_link(struct cpsw_slave *slave,
 
if (mac_control != slave->mac_control) {
phy_print_status(phy);
-   __raw_writel(mac_control, >sliver->mac_control);
+   writel_relaxed(mac_control, >sliver->mac_control);
}
 
slave->mac_control = mac_control;
@@ -1278,7 +1278,7 @@ static void cpsw_slave_open(struct cpsw_slave *slave, 
struct cpsw_priv *priv)
soft_reset_slave(slave);
 
/* setup priority mapping */
-   __raw_writel(RX_PRIORITY_MAPPING, >sliver->rx_pri_map);
+   writel_relaxed(RX_PRIORITY_MAPPING, >sliver->rx_pri_map);
 
switch (cpsw->version) {
case CPSW_VERSION_1:
@@ -1304,7 +1304,7 @@ static void cpsw_slave_open(struct cpsw_slave *slave, 
struct cpsw_priv *priv)
}
 
/* setup max packet size, and mac address */
-   __raw_writel(cpsw->rx_packet_max, >sliver->rx_maxlen);
+   writel_relaxed(cpsw->rx_packet_max, >sliver->rx_maxlen);
cpsw_set_slave_mac(slave, priv);
 
slave->mac_control = 0; /* no link yet */
@@ -1395,9 +1395,9 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
writel(fifo_mode, >host_port_regs->tx_in_ctl);
 
/* setup host port priority mapping */
-   __raw_writel(CPDMA_TX_PRIORITY_MAP,
->host_port_regs->cpdma_tx_pri_map);
-   __raw_writel(0, >host_port_regs->cpdma_rx_chan_map);
+   writel_relaxed(CPDMA_TX_PRIORITY_MAP,
+  >host_port_regs->cpdma_tx_pri_map);
+   writel_relaxed(0, >host_port_regs->cpdma_rx_chan_map);
 
cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM,
 ALE_PORT_STATE, ALE_PORT_STATE_FORWARD);
@@ -1514,10 +1514,10 @@ static int cpsw_ndo_open(struct net_device *ndev)
/* initialize shared resources for every ndev */
if (!cpsw->usage_count) {
/* disable priority elevation */
-   __raw_writel(0, >regs->ptype);
+   writel_relaxed(0, >regs->ptype);
 
/* enable statistics collection only on all ports */
-   __raw_writel(0x7, >regs->stat_port_en);
+   writel_relaxed(0x7, >regs->stat_port_en);
 
/* Enable internal fifo flow control */
writel(0x7, >regs->flow_control);
@@ -1701,7 +1701,7 @@ static void cpsw_hwtstamp_v2(struct cpsw_priv *priv)
 
slave_write(slave, mtype, CPSW2_TS_SEQ_MTYPE);
slave_write(slave, ctrl, CPSW2_CONTROL);
-   __raw_writel(ETH_P_1588, >regs->ts_ltype);
+   writel_relaxed(ETH_P_1588, >regs->ts_ltype);
 }
 
 static int cpsw_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
-- 

[PATCH net-next 06/11] net: ethernet: ti: ale: use proper io apis

2017-11-30 Thread Grygorii Strashko
Switch to use writel_relaxed/readl_relaxed() IO API instead of raw version
as it is recommended.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw_ale.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c 
b/drivers/net/ethernet/ti/cpsw_ale.c
index b432a75..fca5c29 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -150,11 +150,11 @@ static int cpsw_ale_read(struct cpsw_ale *ale, int idx, 
u32 *ale_entry)
 
WARN_ON(idx > ale->params.ale_entries);
 
-   __raw_writel(idx, ale->params.ale_regs + ALE_TABLE_CONTROL);
+   writel_relaxed(idx, ale->params.ale_regs + ALE_TABLE_CONTROL);
 
for (i = 0; i < ALE_ENTRY_WORDS; i++)
-   ale_entry[i] = __raw_readl(ale->params.ale_regs +
-  ALE_TABLE + 4 * i);
+   ale_entry[i] = readl_relaxed(ale->params.ale_regs +
+ALE_TABLE + 4 * i);
 
return idx;
 }
@@ -166,11 +166,11 @@ static int cpsw_ale_write(struct cpsw_ale *ale, int idx, 
u32 *ale_entry)
WARN_ON(idx > ale->params.ale_entries);
 
for (i = 0; i < ALE_ENTRY_WORDS; i++)
-   __raw_writel(ale_entry[i], ale->params.ale_regs +
-ALE_TABLE + 4 * i);
+   writel_relaxed(ale_entry[i], ale->params.ale_regs +
+  ALE_TABLE + 4 * i);
 
-   __raw_writel(idx | ALE_TABLE_WRITE, ale->params.ale_regs +
-ALE_TABLE_CONTROL);
+   writel_relaxed(idx | ALE_TABLE_WRITE, ale->params.ale_regs +
+  ALE_TABLE_CONTROL);
 
return idx;
 }
@@ -733,9 +733,9 @@ int cpsw_ale_control_set(struct cpsw_ale *ale, int port, 
int control,
offset = info->offset + (port * info->port_offset);
shift  = info->shift  + (port * info->port_shift);
 
-   tmp = __raw_readl(ale->params.ale_regs + offset);
+   tmp = readl_relaxed(ale->params.ale_regs + offset);
tmp = (tmp & ~(mask << shift)) | (value << shift);
-   __raw_writel(tmp, ale->params.ale_regs + offset);
+   writel_relaxed(tmp, ale->params.ale_regs + offset);
 
return 0;
 }
@@ -760,7 +760,7 @@ int cpsw_ale_control_get(struct cpsw_ale *ale, int port, 
int control)
offset = info->offset + (port * info->port_offset);
shift  = info->shift  + (port * info->port_shift);
 
-   tmp = __raw_readl(ale->params.ale_regs + offset) >> shift;
+   tmp = readl_relaxed(ale->params.ale_regs + offset) >> shift;
return tmp & BITMASK(info->bits);
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_control_get);
@@ -781,7 +781,7 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 {
u32 rev, ale_entries;
 
-   rev = __raw_readl(ale->params.ale_regs + ALE_IDVER);
+   rev = readl_relaxed(ale->params.ale_regs + ALE_IDVER);
if (!ale->params.major_ver_mask)
ale->params.major_ver_mask = 0xff;
ale->version =
@@ -793,8 +793,8 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 
if (!ale->params.ale_entries) {
ale_entries =
-   __raw_readl(ale->params.ale_regs + ALE_STATUS) &
-   ALE_STATUS_SIZE_MASK;
+   readl_relaxed(ale->params.ale_regs + ALE_STATUS) &
+   ALE_STATUS_SIZE_MASK;
/* ALE available on newer NetCP switches has introduced
 * a register, ALE_STATUS, to indicate the size of ALE
 * table which shows the size as a multiple of 1024 entries.
-- 
2.10.5



[PATCH net-next 05/11] net: ethernet: ti: cpsw: fix ale port numbers

2017-11-30 Thread Grygorii Strashko
TI OMAP/Sitara SoCs have fixed number of ALE ports 3, which includes Host
port also.

Hence, use a fixed value instead of a value calculated from DT, which can be
set by the user and might not reflect the actual HW configuration.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 2bf0bda..64bdd92 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -88,6 +88,7 @@ do {  
\
 #define CPSW_VERSION_4 0x190112
 
 #define HOST_PORT_NUM  0
+#define CPSW_ALE_PORTS_NUM 3
 #define SLIVER_SIZE0x40
 
 #define CPSW1_HOST_PORT_OFFSET 0x028
@@ -3074,7 +3075,7 @@ static int cpsw_probe(struct platform_device *pdev)
ale_params.dev  = >dev;
ale_params.ale_ageout   = ale_ageout;
ale_params.ale_entries  = data->ale_entries;
-   ale_params.ale_ports= data->slaves;
+   ale_params.ale_ports= CPSW_ALE_PORTS_NUM;
 
cpsw->ale = cpsw_ale_create(_params);
if (!cpsw->ale) {
-- 
2.10.5



[PATCH net-next 10/11] net: ethernet: ti: ale: use devm_kzalloc in cpsw_ale_create()

2017-11-30 Thread Grygorii Strashko
Use devm_kzalloc() in cpsw_ale_create(). This also makes the
cpsw_ale_destroy() function a nop, so remove it.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c| 17 +++--
 drivers/net/ethernet/ti/cpsw_ale.c| 11 +--
 drivers/net/ethernet/ti/cpsw_ale.h|  1 -
 drivers/net/ethernet/ti/netcp_ethss.c |  1 -
 4 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 64bdd92..a60a378 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -3087,14 +3087,14 @@ static int cpsw_probe(struct platform_device *pdev)
cpsw->cpts = cpts_create(cpsw->dev, cpts_regs, cpsw->dev->of_node);
if (IS_ERR(cpsw->cpts)) {
ret = PTR_ERR(cpsw->cpts);
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
ndev->irq = platform_get_irq(pdev, 1);
if (ndev->irq < 0) {
dev_err(priv->dev, "error getting irq resource\n");
ret = ndev->irq;
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
of_id = of_match_device(cpsw_of_mtable, >dev);
@@ -3118,7 +3118,7 @@ static int cpsw_probe(struct platform_device *pdev)
if (ret) {
dev_err(priv->dev, "error registering net device\n");
ret = -ENODEV;
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
if (cpsw->data.dual_emac) {
@@ -3141,7 +3141,7 @@ static int cpsw_probe(struct platform_device *pdev)
irq = platform_get_irq(pdev, 1);
if (irq < 0) {
ret = irq;
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
cpsw->irqs_table[0] = irq;
@@ -3149,14 +3149,14 @@ static int cpsw_probe(struct platform_device *pdev)
   0, dev_name(>dev), cpsw);
if (ret < 0) {
dev_err(priv->dev, "error attaching irq (%d)\n", ret);
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
/* TX IRQ */
irq = platform_get_irq(pdev, 2);
if (irq < 0) {
ret = irq;
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
cpsw->irqs_table[1] = irq;
@@ -3164,7 +3164,7 @@ static int cpsw_probe(struct platform_device *pdev)
   0, dev_name(>dev), cpsw);
if (ret < 0) {
dev_err(priv->dev, "error attaching irq (%d)\n", ret);
-   goto clean_ale_ret;
+   goto clean_dma_ret;
}
 
cpsw_notice(priv, probe,
@@ -3177,8 +3177,6 @@ static int cpsw_probe(struct platform_device *pdev)
 
 clean_unregister_netdev_ret:
unregister_netdev(ndev);
-clean_ale_ret:
-   cpsw_ale_destroy(cpsw->ale);
 clean_dma_ret:
cpdma_ctlr_destroy(cpsw->dma);
 clean_dt_ret:
@@ -3208,7 +3206,6 @@ static int cpsw_remove(struct platform_device *pdev)
unregister_netdev(ndev);
 
cpts_release(cpsw->cpts);
-   cpsw_ale_destroy(cpsw->ale);
cpdma_ctlr_destroy(cpsw->dma);
cpsw_remove_dt(pdev);
pm_runtime_put_sync(>dev);
diff --git a/drivers/net/ethernet/ti/cpsw_ale.c 
b/drivers/net/ethernet/ti/cpsw_ale.c
index 53aa721..e513248 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -802,7 +802,7 @@ struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params 
*params)
struct cpsw_ale *ale;
u32 rev, ale_entries;
 
-   ale = kzalloc(sizeof(*ale), GFP_KERNEL);
+   ale = devm_kzalloc(params->dev, sizeof(*ale), GFP_KERNEL);
if (!ale)
return NULL;
 
@@ -881,15 +881,6 @@ struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params 
*params)
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_create);
 
-int cpsw_ale_destroy(struct cpsw_ale *ale)
-{
-   if (!ale)
-   return -EINVAL;
-   kfree(ale);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(cpsw_ale_destroy);
-
 void cpsw_ale_dump(struct cpsw_ale *ale, u32 *data)
 {
int i;
diff --git a/drivers/net/ethernet/ti/cpsw_ale.h 
b/drivers/net/ethernet/ti/cpsw_ale.h
index 25d24e8..d4fe901 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.h
+++ b/drivers/net/ethernet/ti/cpsw_ale.h
@@ -100,7 +100,6 @@ enum cpsw_ale_port_state {
 #define ALE_ENTRY_WORDSDIV_ROUND_UP(ALE_ENTRY_BITS, 32)
 
 struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params *params);
-int cpsw_ale_destroy(struct cpsw_ale *ale);
 
 void cpsw_ale_start(struct cpsw_ale *ale);
 void cpsw_ale_stop(struct cpsw_ale *ale);
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c 
b/drivers/net/ethernet/ti/netcp_ethss.c
index 12765e4..56dbc0b 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -3689,7 +3689,6 @@ static int gbe_remove(struct netcp_device *netcp_device, 
void *inst_priv)

[PATCH net-next 09/11] net: ethernet: ti: ale: move static initialization in cpsw_ale_create()

2017-11-30 Thread Grygorii Strashko
Move static initialization from cpsw_ale_start() to cpsw_ale_create() as it
does not make much sense to perform static initialization in
cpsw_ale_start(), which is called every time a netif is opened.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw_ale.c | 57 +++---
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c 
b/drivers/net/ethernet/ti/cpsw_ale.c
index f8c523d..53aa721 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -779,8 +779,36 @@ static void cpsw_ale_timer(struct timer_list *t)
 
 void cpsw_ale_start(struct cpsw_ale *ale)
 {
+   cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
+   cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1);
+
+   timer_setup(>timer, cpsw_ale_timer, 0);
+   if (ale->ageout) {
+   ale->timer.expires = jiffies + ale->ageout;
+   add_timer(>timer);
+   }
+}
+EXPORT_SYMBOL_GPL(cpsw_ale_start);
+
+void cpsw_ale_stop(struct cpsw_ale *ale)
+{
+   del_timer_sync(>timer);
+   cpsw_ale_control_set(ale, 0, ALE_ENABLE, 0);
+}
+EXPORT_SYMBOL_GPL(cpsw_ale_stop);
+
+struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params *params)
+{
+   struct cpsw_ale *ale;
u32 rev, ale_entries;
 
+   ale = kzalloc(sizeof(*ale), GFP_KERNEL);
+   if (!ale)
+   return NULL;
+
+   ale->params = *params;
+   ale->ageout = ale->params.ale_ageout * HZ;
+
rev = readl_relaxed(ale->params.ale_regs + ALE_IDVER);
if (!ale->params.major_ver_mask)
ale->params.major_ver_mask = 0xff;
@@ -849,35 +877,6 @@ void cpsw_ale_start(struct cpsw_ale *ale)
ALE_UNKNOWNVLAN_FORCE_UNTAG_EGRESS;
}
 
-   cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
-   cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1);
-
-   timer_setup(>timer, cpsw_ale_timer, 0);
-   if (ale->ageout) {
-   ale->timer.expires = jiffies + ale->ageout;
-   add_timer(>timer);
-   }
-}
-EXPORT_SYMBOL_GPL(cpsw_ale_start);
-
-void cpsw_ale_stop(struct cpsw_ale *ale)
-{
-   del_timer_sync(>timer);
-   cpsw_ale_control_set(ale, 0, ALE_ENABLE, 0);
-}
-EXPORT_SYMBOL_GPL(cpsw_ale_stop);
-
-struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params *params)
-{
-   struct cpsw_ale *ale;
-
-   ale = kzalloc(sizeof(*ale), GFP_KERNEL);
-   if (!ale)
-   return NULL;
-
-   ale->params = *params;
-   ale->ageout = ale->params.ale_ageout * HZ;
-
return ale;
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_create);
-- 
2.10.5



[PATCH net-next 07/11] net: ethernet: ti: ale: disable ale from stop()

2017-11-30 Thread Grygorii Strashko
ALE is enabled from cpsw_ale_start() now, but disabled only from
cpsw_ale_destroy(), which introduces an inconsistency, as cpsw_ale_start()
is called when a netif is opened, but cpsw_ale_destroy() is called when the
driver is removed. Hence, move ALE disabling into cpsw_ale_stop().

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw_ale.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c 
b/drivers/net/ethernet/ti/cpsw_ale.c
index fca5c29..5eaaf88 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -870,6 +870,7 @@ EXPORT_SYMBOL_GPL(cpsw_ale_start);
 void cpsw_ale_stop(struct cpsw_ale *ale)
 {
del_timer_sync(>timer);
+   cpsw_ale_control_set(ale, 0, ALE_ENABLE, 0);
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_stop);
 
@@ -892,7 +893,6 @@ int cpsw_ale_destroy(struct cpsw_ale *ale)
 {
if (!ale)
return -EINVAL;
-   cpsw_ale_control_set(ale, 0, ALE_ENABLE, 0);
kfree(ale);
return 0;
 }
-- 
2.10.5



[PATCH 3/3] make sock_alloc_file() do sock_release() on failures

2017-11-30 Thread Al Viro
This changes calling conventions (and simplifies the hell out
the callers).  New rules: once struct socket had been passed
to sock_alloc_file(), it's been consumed either by struct file
or by sock_release() done by sock_alloc_file().  Either way
the caller should not do sock_release() after that point.

Signed-off-by: Al Viro 
---
 drivers/staging/lustre/lnet/lnet/lib-socket.c |  8 ++--
 net/9p/trans_fd.c |  1 -
 net/kcm/kcmsock.c |  7 +--
 net/sctp/socket.c |  1 -
 net/socket.c  | 25 -
 5 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c 
b/drivers/staging/lustre/lnet/lnet/lib-socket.c
index 539a26444f31..7d49d4865298 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-socket.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c
@@ -71,16 +71,12 @@ lnet_sock_ioctl(int cmd, unsigned long arg)
}
 
sock_filp = sock_alloc_file(sock, 0, NULL);
-   if (IS_ERR(sock_filp)) {
-   sock_release(sock);
-   rc = PTR_ERR(sock_filp);
-   goto out;
-   }
+   if (IS_ERR(sock_filp))
+   return PTR_ERR(sock_filp);
 
rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg);
 
fput(sock_filp);
-out:
return rc;
 }
 
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 985046ae4231..80f5c79053a4 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -839,7 +839,6 @@ static int p9_socket_open(struct p9_client *client, struct 
socket *csocket)
if (IS_ERR(file)) {
pr_err("%s (%d): failed to map fd\n",
   __func__, task_pid_nr(current));
-   sock_release(csocket);
kfree(p);
return PTR_ERR(file);
}
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index c5fa634e63ca..d4e98f20fc2a 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1629,7 +1629,6 @@ static struct file *kcm_clone(struct socket *osock)
 {
struct socket *newsock;
struct sock *newsk;
-   struct file *file;
 
newsock = sock_alloc();
if (!newsock)
@@ -1649,11 +1648,7 @@ static struct file *kcm_clone(struct socket *osock)
sock_init_data(newsock, newsk);
init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
 
-   file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
-   if (IS_ERR(file))
-   sock_release(newsock);
-
-   return file;
+   return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
 }
 
 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3204a9b29407..8bb5163d6331 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5080,7 +5080,6 @@ static int sctp_getsockopt_peeloff_common(struct sock 
*sk, sctp_peeloff_arg_t *p
*newfile = sock_alloc_file(newsock, 0, NULL);
if (IS_ERR(*newfile)) {
put_unused_fd(retval);
-   sock_release(newsock);
retval = PTR_ERR(*newfile);
*newfile = NULL;
return retval;
diff --git a/net/socket.c b/net/socket.c
index 2df83c0bfde9..05f361faec45 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -406,8 +406,10 @@ struct file *sock_alloc_file(struct socket *sock, int 
flags, const char *dname)
name.len = strlen(name.name);
}
path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, );
-   if (unlikely(!path.dentry))
+   if (unlikely(!path.dentry)) {
+   sock_release(sock);
return ERR_PTR(-ENOMEM);
+   }
path.mnt = mntget(sock_mnt);
 
d_instantiate(path.dentry, SOCK_INODE(sock));
@@ -415,9 +417,11 @@ struct file *sock_alloc_file(struct socket *sock, int 
flags, const char *dname)
file = alloc_file(, FMODE_READ | FMODE_WRITE,
  _file_ops);
if (IS_ERR(file)) {
-   /* drop dentry, keep inode */
+   /* drop dentry, keep inode for a bit */
ihold(d_inode(path.dentry));
path_put();
+   /* ... and now kill it properly */
+   sock_release(sock);
return file;
}
 
@@ -1330,19 +1334,9 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, 
protocol)
 
retval = sock_create(family, type, protocol, );
if (retval < 0)
-   goto out;
-
-   retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
-   if (retval < 0)
-   goto out_release;
-
-out:
-   /* It may be already another descriptor 8) Not kernel problem. */
-   return retval;
+   return retval;
 
-out_release:
-   sock_release(sock);
-   return retval;
+   return sock_map_fd(sock, flags & (O_CLOEXEC | 

[PATCH net-next 08/11] net: ethernet: ti: ale: optimize ale entry mask bits configuration

2017-11-30 Thread Grygorii Strashko
The ale->params.ale_ports parameter can be used to derive values for all
ALE entry mask bits: port_mask_bits, port_num_bits, vlan_field_bits.
Hence, calculate the above values and drop all hardcoded values. For the
port_num_bits calculation use the order_base_2() API.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw_ale.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c 
b/drivers/net/ethernet/ti/cpsw_ale.c
index 5eaaf88..f8c523d 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -816,9 +816,9 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 "ALE Table size %ld\n", ale->params.ale_entries);
 
/* set default bits for existing h/w */
-   ale->port_mask_bits = 3;
-   ale->port_num_bits = 2;
-   ale->vlan_field_bits = 3;
+   ale->port_mask_bits = ale->params.ale_ports;
+   ale->port_num_bits = order_base_2(ale->params.ale_ports);
+   ale->vlan_field_bits = ale->params.ale_ports;
 
/* Set defaults override for ALE on NetCP NU switch and for version
 * 1R3
@@ -847,13 +847,6 @@ void cpsw_ale_start(struct cpsw_ale *ale)
ale_controls[ALE_PORT_UNTAGGED_EGRESS].shift = 0;
ale_controls[ALE_PORT_UNTAGGED_EGRESS].offset =
ALE_UNKNOWNVLAN_FORCE_UNTAG_EGRESS;
-   ale->port_mask_bits = ale->params.ale_ports;
-   ale->port_num_bits = ale->params.ale_ports - 1;
-   ale->vlan_field_bits = ale->params.ale_ports;
-   } else if (ale->version == ALE_VERSION_1R3) {
-   ale->port_mask_bits = ale->params.ale_ports;
-   ale->port_num_bits = 3;
-   ale->vlan_field_bits = ale->params.ale_ports;
}
 
cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
-- 
2.10.5



[PATCH 2/3] fix kcm_clone()

2017-11-30 Thread Al Viro
1) it's fput() or sock_release(), not both
2) don't do fd_install() until the last failure exit.
3) not a bug per se, but... don't attach socket to struct file
   until it's set up.

Take reserving descriptor into the caller, move fd_install() to the
caller, sanitize failure exits and calling conventions.

Signed-off-by: Al Viro 
---
 net/kcm/kcmsock.c | 71 +--
 1 file changed, 27 insertions(+), 44 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 0b750a22c4b9..c5fa634e63ca 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1625,60 +1625,35 @@ static struct proto kcm_proto = {
 };
 
 /* Clone a kcm socket. */
-static int kcm_clone(struct socket *osock, struct kcm_clone *info,
-struct socket **newsockp)
+static struct file *kcm_clone(struct socket *osock)
 {
struct socket *newsock;
struct sock *newsk;
-   struct file *newfile;
-   int err, newfd;
+   struct file *file;
 
-   err = -ENFILE;
newsock = sock_alloc();
if (!newsock)
-   goto out;
+   return ERR_PTR(-ENFILE);
 
newsock->type = osock->type;
newsock->ops = osock->ops;
 
__module_get(newsock->ops->owner);
 
-   newfd = get_unused_fd_flags(0);
-   if (unlikely(newfd < 0)) {
-   err = newfd;
-   goto out_fd_fail;
-   }
-
-   newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
-   if (IS_ERR(newfile)) {
-   err = PTR_ERR(newfile);
-   goto out_sock_alloc_fail;
-   }
-
newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
 _proto, true);
if (!newsk) {
-   err = -ENOMEM;
-   goto out_sk_alloc_fail;
+   sock_release(newsock);
+   return ERR_PTR(-ENOMEM);
}
-
sock_init_data(newsock, newsk);
init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
 
-   fd_install(newfd, newfile);
-   *newsockp = newsock;
-   info->fd = newfd;
-
-   return 0;
+   file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+   if (IS_ERR(file))
+   sock_release(newsock);
 
-out_sk_alloc_fail:
-   fput(newfile);
-out_sock_alloc_fail:
-   put_unused_fd(newfd);
-out_fd_fail:
-   sock_release(newsock);
-out:
-   return err;
+   return file;
 }
 
 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
@@ -1708,17 +1683,25 @@ static int kcm_ioctl(struct socket *sock, unsigned int 
cmd, unsigned long arg)
}
case SIOCKCMCLONE: {
struct kcm_clone info;
-   struct socket *newsock = NULL;
-
-   err = kcm_clone(sock, , );
-   if (!err) {
-   if (copy_to_user((void __user *)arg, ,
-sizeof(info))) {
-   err = -EFAULT;
-   sys_close(info.fd);
-   }
-   }
+   struct file *file;
+
+   info.fd = get_unused_fd_flags(0);
+   if (unlikely(info.fd < 0))
+   return info.fd;
 
+   file = kcm_clone(sock);
+   if (IS_ERR(file)) {
+   put_unused_fd(info.fd);
+   return PTR_ERR(file);
+   }
+   if (copy_to_user((void __user *)arg, ,
+sizeof(info))) {
+   put_unused_fd(info.fd);
+   fput(file);
+   return -EFAULT;
+   }
+   fd_install(info.fd, file);
+   err = 0;
break;
}
default:
-- 
2.11.0



[PATCH net-next 01/11] net: ethernet: ti: cpsw: drop unused var poll from cpsw_update_channels_res

2017-11-30 Thread Grygorii Strashko
Drop unused variable "poll" from cpsw_update_channels_res().

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index a73600d..523c110 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2298,7 +2298,6 @@ static int cpsw_check_ch_settings(struct cpsw_common 
*cpsw,
 
 static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
 {
-   int (*poll)(struct napi_struct *, int);
struct cpsw_common *cpsw = priv->cpsw;
void (*handler)(void *, int, int);
struct netdev_queue *queue;
@@ -2309,12 +2308,10 @@ static int cpsw_update_channels_res(struct cpsw_priv 
*priv, int ch_num, int rx)
ch = >rx_ch_num;
vec = cpsw->rxv;
handler = cpsw_rx_handler;
-   poll = cpsw_rx_poll;
} else {
ch = >tx_ch_num;
vec = cpsw->txv;
handler = cpsw_tx_handler;
-   poll = cpsw_tx_poll;
}
 
while (*ch < ch_num) {
-- 
2.10.5



[PATCH 1/3] socketpair(): allocate descriptors first

2017-11-30 Thread Al Viro
simplifies failure exits considerably...

Signed-off-by: Al Viro 
---
 net/socket.c | 89 ++--
 1 file changed, 38 insertions(+), 51 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index 42d8e9c9ccd5..2df83c0bfde9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1366,87 +1366,74 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, 
int, protocol,
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
/*
+* reserve descriptors and make sure we won't fail
+* to return them to userland.
+*/
+   fd1 = get_unused_fd_flags(flags);
+   if (unlikely(fd1 < 0))
+   return fd1;
+
+   fd2 = get_unused_fd_flags(flags);
+   if (unlikely(fd2 < 0)) {
+   put_unused_fd(fd1);
+   return fd2;
+   }
+
+   err = put_user(fd1, [0]);
+   if (err)
+   goto out;
+
+   err = put_user(fd2, [1]);
+   if (err)
+   goto out;
+
+   /*
 * Obtain the first socket and check if the underlying protocol
 * supports the socketpair call.
 */
 
err = sock_create(family, type, protocol, );
-   if (err < 0)
+   if (unlikely(err < 0))
goto out;
 
err = sock_create(family, type, protocol, );
-   if (err < 0)
-   goto out_release_1;
-
-   err = sock1->ops->socketpair(sock1, sock2);
-   if (err < 0)
-   goto out_release_both;
-
-   fd1 = get_unused_fd_flags(flags);
-   if (unlikely(fd1 < 0)) {
-   err = fd1;
-   goto out_release_both;
+   if (unlikely(err < 0)) {
+   sock_release(sock1);
+   goto out;
}
 
-   fd2 = get_unused_fd_flags(flags);
-   if (unlikely(fd2 < 0)) {
-   err = fd2;
-   goto out_put_unused_1;
+   err = sock1->ops->socketpair(sock1, sock2);
+   if (unlikely(err < 0)) {
+   sock_release(sock2);
+   sock_release(sock1);
+   goto out;
}
 
newfile1 = sock_alloc_file(sock1, flags, NULL);
if (IS_ERR(newfile1)) {
err = PTR_ERR(newfile1);
-   goto out_put_unused_both;
+   sock_release(sock1);
+   sock_release(sock2);
+   goto out;
}
 
newfile2 = sock_alloc_file(sock2, flags, NULL);
if (IS_ERR(newfile2)) {
err = PTR_ERR(newfile2);
-   goto out_fput_1;
+   sock_release(sock2);
+   fput(newfile1);
+   goto out;
}
 
-   err = put_user(fd1, [0]);
-   if (err)
-   goto out_fput_both;
-
-   err = put_user(fd2, [1]);
-   if (err)
-   goto out_fput_both;
-
audit_fd_pair(fd1, fd2);
 
fd_install(fd1, newfile1);
fd_install(fd2, newfile2);
-   /* fd1 and fd2 may be already another descriptors.
-* Not kernel problem.
-*/
-
return 0;
 
-out_fput_both:
-   fput(newfile2);
-   fput(newfile1);
-   put_unused_fd(fd2);
-   put_unused_fd(fd1);
-   goto out;
-
-out_fput_1:
-   fput(newfile1);
-   put_unused_fd(fd2);
-   put_unused_fd(fd1);
-   sock_release(sock2);
-   goto out;
-
-out_put_unused_both:
+out:
put_unused_fd(fd2);
-out_put_unused_1:
put_unused_fd(fd1);
-out_release_both:
-   sock_release(sock2);
-out_release_1:
-   sock_release(sock1);
-out:
return err;
 }
 
-- 
2.11.0



[PATCH net-next 11/11] net: ethernet: ti: ale: fix port check in cpsw_ale_control_set/get

2017-11-30 Thread Grygorii Strashko
The ALE port count includes the host port and the external ports, and
ALE port numbering starts from 0, so correct the corresponding port
checks in cpsw_ale_control_set()/cpsw_ale_control_get().

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw_ale.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c 
b/drivers/net/ethernet/ti/cpsw_ale.c
index e513248..93dc05c 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -723,7 +723,7 @@ int cpsw_ale_control_set(struct cpsw_ale *ale, int port, 
int control,
if (info->port_offset == 0 && info->port_shift == 0)
port = 0; /* global, port is a dont care */
 
-   if (port < 0 || port > ale->params.ale_ports)
+   if (port < 0 || port >= ale->params.ale_ports)
return -EINVAL;
 
mask = BITMASK(info->bits);
@@ -754,7 +754,7 @@ int cpsw_ale_control_get(struct cpsw_ale *ale, int port, 
int control)
if (info->port_offset == 0 && info->port_shift == 0)
port = 0; /* global, port is a dont care */
 
-   if (port < 0 || port > ale->params.ale_ports)
+   if (port < 0 || port >= ale->params.ale_ports)
return -EINVAL;
 
offset = info->offset + (port * info->port_offset);
-- 
2.10.5



[RFC][PATCHES] sock_alloc_file() cleanups and fixes

2017-11-30 Thread Al Viro
Almost all sock_alloc_file() callers want sock_release()
in case of failure.  Currently it consumes socket on success
(it will be destroyed on final fput() of resulting file) and
leaves it alone on failure.  Making it consume socket in all
cases makes for simpler life in callers.

There are 3 exceptions:

* sock_map_fd() calls sock_alloc_file(), but doesn't do sock_release()
  in case of failure.  Its caller (sys_socket()) does, though, and it
  does get considerably simpler with sock_alloc_file() doing the cleanup
  in case of failure.

* sys_socketpair().  Handling of sock_alloc_file() failures is complicated
  by attempts to share bits and pieces of failure exits between various
  points of failure in there.  Reordering things a bit (reserving descriptors
  and copying them to userland before doing anything else) makes for simpler
  handling of failure exits and after such massage we get the situation
  when failure of sock_alloc_file() is immediately followed by sock_release().

* kcm_clone().  Badly broken in several respects - sk_alloc() failure ends
  up with double-free of struct socket (we do fput(), then sock_release())
  and copy_to_user() failure uses sys_close() to undo fd_install(), which
  is something we should never do.  Descriptor table might be shared and
  fd_install() should only be done past the last possible failure point.
  Fixing all of that is simple - we just need to move allocation of
  descriptor and fd_install() into the caller (before and after the call of
  kcm_clone(), resp.) and untangle the failure exits.  Makes for much simpler
  calling conventions for kcm_clone(), while we are at it, and as a side
  effect we get "sock_release() in case of sock_alloc_file() failure" for
  that one as well.

The patch series (in followups to this mail and in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#work.net) does
the following:
1) massage sys_socketpair() (should be a pure cleanup)
2) fix and clean up kcm_clone() (-stable fodder)
3) switch sock_alloc_file() to new calling conventions.

It got some local testing, but it certainly needs more review.
Diffstat for the entire thing is
 drivers/staging/lustre/lnet/lnet/lib-socket.c |   8 ++---
 net/9p/trans_fd.c |   1 -
 net/kcm/kcmsock.c |  68 
++---
 net/sctp/socket.c |   1 -
 net/socket.c  | 110 
+++
 5 files changed, 69 insertions(+), 119 deletions(-)

Please, review and comment.


[PATCH net-next 2/5] rhashtable: Add rhashtable_walk_peek

2017-11-30 Thread Tom Herbert
This function is like rhashtable_walk_next except that it only returns
the current element in the iter and does not advance the iter.

This patch also creates __rhashtable_walk_find_next. It finds the next
element in the table when the entry cached in iter is NULL or at the end
of a slot. __rhashtable_walk_find_next is called from
rhashtable_walk_next and rhashtable_walk_peek.

Signed-off-by: Tom Herbert 
---
 include/linux/rhashtable.h |   1 +
 lib/rhashtable.c   | 103 ++---
 2 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 4c976bf320a8..7f3e674e127a 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -380,6 +380,7 @@ void rhashtable_walk_enter(struct rhashtable *ht,
 void rhashtable_walk_exit(struct rhashtable_iter *iter);
 void rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU);
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index eeddfb3199cd..1d58231110af 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -753,18 +753,16 @@ void rhashtable_walk_start(struct rhashtable_iter *iter)
 EXPORT_SYMBOL_GPL(rhashtable_walk_start);
 
 /**
- * rhashtable_walk_next - Return the next object and advance the iterator
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
  * @iter:  Hash table iterator
  *
- * Note that you must call rhashtable_walk_stop when you are finished
- * with the walk.
+ * Returns the found object or NULL when the end of the table is reached.
  *
- * Returns the next object or NULL when the end of the table is reached.
- *
- * Returns -EAGAIN if resize event occured.  Note that the iterator
- * will rewind back to the beginning and you may continue to use it.
+ * Returns -EAGAIN if resize event occurred.
  */
-void *rhashtable_walk_next(struct rhashtable_iter *iter)
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
 {
struct bucket_table *tbl = iter->walker.tbl;
struct rhlist_head *list = iter->list;
@@ -772,14 +770,6 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
struct rhash_head *p = iter->p;
bool rhlist = ht->rhlist;
 
-   if (p) {
-   if (!rhlist || !(list = rcu_dereference(list->next))) {
-   p = rcu_dereference(p->next);
-   list = container_of(p, struct rhlist_head, rhead);
-   }
-   goto next;
-   }
-
for (; iter->slot < tbl->size; iter->slot++) {
int skip = iter->skip;
 
@@ -826,9 +816,90 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 
return NULL;
 }
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter:  Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+   struct rhlist_head *list = iter->list;
+   struct rhashtable *ht = iter->ht;
+   struct rhash_head *p = iter->p;
+   bool rhlist = ht->rhlist;
+
+   if (!iter->walker.tbl)
+   return NULL;
+
+   if (p) {
+   if (!rhlist || !(list = rcu_dereference(list->next))) {
+   p = rcu_dereference(p->next);
+   list = container_of(p, struct rhlist_head, rhead);
+   }
+   if (!rht_is_a_nulls(p)) {
+   iter->skip++;
+   iter->p = p;
+   iter->list = list;
+   return rht_obj(ht, rhlist ? >rhead : p);
+   }
+
+   /* At the end of this slot, switch to next one and then find
+* next entry from that point.
+*/
+   iter->skip = 0;
+   iter->slot++;
+   }
+
+   return __rhashtable_walk_find_next(iter);
+}
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
 /**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter:  Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+   struct rhlist_head *list = 

[PATCH net-next 4/5] spinlock: Add library function to allocate spinlock buckets array

2017-11-30 Thread Tom Herbert
Add two new library functions: alloc_bucket_spinlocks and
free_bucket_spinlocks. These are used to allocate and free an array
of spinlocks that are useful as locks for hash buckets. The interface
specifies the maximum number of spinlocks in the array as well
as a CPU multiplier to derive the number of spinlocks to allocate.
The number allocated is rounded up to a power of two to make the
array amenable to hash lookup.

Signed-off-by: Tom Herbert 
---
 include/linux/spinlock.h |  6 ++
 lib/Makefile |  2 +-
 lib/bucket_locks.c   | 54 
 3 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 lib/bucket_locks.c

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index a39186194cd6..10fd28b118ee 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -414,4 +414,10 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, 
spinlock_t *lock);
 #define atomic_dec_and_lock(atomic, lock) \
__cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
 
+int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
+  size_t max_size, unsigned int cpu_mult,
+  gfp_t gfp);
+
+void free_bucket_spinlocks(spinlock_t *locks);
+
 #endif /* __LINUX_SPINLOCK_H */
diff --git a/lib/Makefile b/lib/Makefile
index d11c48ec8ffd..a6c8529dd9b2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -39,7 +39,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o 
random32.o \
 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-once.o refcount.o usercopy.o errseq.o
+once.o refcount.o usercopy.o errseq.o bucket_locks.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/bucket_locks.c b/lib/bucket_locks.c
new file mode 100644
index ..266a97c5708b
--- /dev/null
+++ b/lib/bucket_locks.c
@@ -0,0 +1,54 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Allocate an array of spinlocks to be accessed by a hash. Two arguments
+ * indicate the number of elements to allocate in the array. max_size
+ * gives the maximum number of elements to allocate. cpu_mult gives
+ * the number of locks per CPU to allocate. The size is rounded up
+ * to a power of 2 to be suitable as a hash table.
+ */
+
+int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask,
+  size_t max_size, unsigned int cpu_mult, gfp_t gfp)
+{
+   spinlock_t *tlocks = NULL;
+   unsigned int i, size;
+#if defined(CONFIG_PROVE_LOCKING)
+   unsigned int nr_pcpus = 2;
+#else
+   unsigned int nr_pcpus = num_possible_cpus();
+#endif
+
+   if (cpu_mult) {
+   nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
+   size = min_t(unsigned int, nr_pcpus * cpu_mult, max_size);
+   } else {
+   size = max_size;
+   }
+
+   if (sizeof(spinlock_t) != 0) {
+   if (gfpflags_allow_blocking(gfp))
+   tlocks = kvmalloc(size * sizeof(spinlock_t), gfp);
+   else
+   tlocks = kmalloc_array(size, sizeof(spinlock_t), gfp);
+   if (!tlocks)
+   return -ENOMEM;
+   for (i = 0; i < size; i++)
+   spin_lock_init([i]);
+   }
+
+   *locks = tlocks;
+   *locks_mask = size - 1;
+
+   return 0;
+}
+EXPORT_SYMBOL(alloc_bucket_spinlocks);
+
+void free_bucket_spinlocks(spinlock_t *locks)
+{
+   kvfree(locks);
+}
+EXPORT_SYMBOL(free_bucket_spinlocks);
-- 
2.11.0



[PATCH net-next 1/5] rhashtable: Don't reset walker table in rhashtable_walk_start

2017-11-30 Thread Tom Herbert
Remove the code that resets the walker table. The walker table should
only be initialized in the walk init function or when a future table is
encountered. If the walker table is NULL this is the indication that
the walk has completed and this information can be used to break a
multi-call walk in the table (e.g. successive calls to netlink_dump
that are dumping elements of an rhashtable).

This also allows us to change rhashtable_walk_start to return void
since the only error it was returning was -EAGAIN for a table change.
This patch changes all the callers of rhashtable_walk_start to expect
void which eliminates logic needed to check the return value for a
rare condition. Note that -EAGAIN will be returned in a call
to rhashtable_walk_next which seems to always follow the start
of the walk so there should be no behavioral change in doing this.

Signed-off-by: Tom Herbert 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c   |  6 +---
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   |  7 ++---
 fs/gfs2/glock.c|  8 ++---
 include/linux/rhashtable.h |  2 +-
 include/net/sctp/sctp.h|  2 +-
 lib/rhashtable.c   | 18 ++--
 lib/test_rhashtable.c  |  6 +---
 net/ipv6/ila/ila_xlat.c|  4 +--
 net/ipv6/seg6.c|  4 +--
 net/mac80211/mesh_pathtbl.c| 34 +++---
 net/netfilter/nft_set_hash.c   | 10 ++-
 net/netlink/af_netlink.c   |  5 ++--
 net/netlink/diag.c |  8 ++---
 net/sctp/proc.c|  6 +---
 net/sctp/socket.c  | 19 +++-
 net/tipc/socket.c  |  6 ++--
 16 files changed, 37 insertions(+), 108 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index d5031f436f83..df6a57087848 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -1416,11 +1416,7 @@ bnxt_tc_flow_stats_batch_prep(struct bnxt *bp,
void *flow_node;
int rc, i;
 
-   rc = rhashtable_walk_start(iter);
-   if (rc && rc != -EAGAIN) {
-   i = 0;
-   goto done;
-   }
+   rhashtable_walk_start(iter);
 
rc = 0;
for (i = 0; i < BNXT_FLOW_STATS_BATCH_MAX; i++) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index d4a548a6a55c..6d7a10d0c45e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -765,9 +765,7 @@ static void ch_flower_stats_handler(struct work_struct 
*work)
 
rhashtable_walk_enter(>flower_tbl, );
do {
-   flower_entry = ERR_PTR(rhashtable_walk_start());
-   if (IS_ERR(flower_entry))
-   goto walk_stop;
+   rhashtable_walk_start();
 
while ((flower_entry = rhashtable_walk_next()) &&
   !IS_ERR(flower_entry)) {
@@ -786,8 +784,9 @@ static void ch_flower_stats_handler(struct work_struct 
*work)
spin_unlock(_entry->lock);
}
}
-walk_stop:
+
rhashtable_walk_stop();
+
} while (flower_entry == ERR_PTR(-EAGAIN));
rhashtable_walk_exit();
mod_timer(>flower_stats_timer, jiffies + STATS_CHECK_PERIOD);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 11066d8647d2..20d1b6e2d829 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1549,16 +1549,13 @@ static void glock_hash_walk(glock_examiner examiner, 
const struct gfs2_sbd *sdp)
rhashtable_walk_enter(_hash_table, );
 
do {
-   gl = ERR_PTR(rhashtable_walk_start());
-   if (IS_ERR(gl))
-   goto walk_stop;
+   rhashtable_walk_start();
 
while ((gl = rhashtable_walk_next()) && !IS_ERR(gl))
if (gl->gl_name.ln_sbd == sdp &&
lockref_get_not_dead(>gl_lockref))
examiner(gl);
 
-walk_stop:
rhashtable_walk_stop();
} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
 
@@ -1947,8 +1944,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, 
loff_t *pos)
loff_t n = *pos;
 
rhashtable_walk_enter(_hash_table, >hti);
-   if (rhashtable_walk_start(>hti) != 0)
-   return NULL;
+   rhashtable_walk_start(>hti);
 
do {
gfs2_glock_iter_next(gi);
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 361c08e35dbc..4c976bf320a8 100644
--- 

[PATCH net-next 5/5] rhashtable: Call library function alloc_bucket_locks

2017-11-30 Thread Tom Herbert
To allocate the array of bucket locks for the hash table we now
call library function alloc_bucket_spinlocks. This function is
based on the old alloc_bucket_locks in rhashtable and should
produce the same effect.

Signed-off-by: Tom Herbert 
---
 lib/rhashtable.c | 47 ---
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 1d58231110af..a9c04e5e4767 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -65,42 +65,6 @@ EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
 #define ASSERT_RHT_MUTEX(HT)
 #endif
 
-
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
- gfp_t gfp)
-{
-   unsigned int i, size;
-#if defined(CONFIG_PROVE_LOCKING)
-   unsigned int nr_pcpus = 2;
-#else
-   unsigned int nr_pcpus = num_possible_cpus();
-#endif
-
-   nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
-   size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
-
-   /* Never allocate more than 0.5 locks per bucket */
-   size = min_t(unsigned int, size, tbl->size >> 1);
-
-   if (tbl->nest)
-   size = min(size, 1U << tbl->nest);
-
-   if (sizeof(spinlock_t) != 0) {
-   if (gfpflags_allow_blocking(gfp))
-   tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp);
-   else
-   tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
-  gfp);
-   if (!tbl->locks)
-   return -ENOMEM;
-   for (i = 0; i < size; i++)
-   spin_lock_init(>locks[i]);
-   }
-   tbl->locks_mask = size - 1;
-
-   return 0;
-}
-
 static void nested_table_free(union nested_table *ntbl, unsigned int size)
 {
const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
@@ -140,7 +104,7 @@ static void bucket_table_free(const struct bucket_table 
*tbl)
if (tbl->nest)
nested_bucket_table_free(tbl);
 
-   kvfree(tbl->locks);
+   free_bucket_spinlocks(tbl->locks);
kvfree(tbl);
 }
 
@@ -207,7 +171,7 @@ static struct bucket_table *bucket_table_alloc(struct 
rhashtable *ht,
   gfp_t gfp)
 {
struct bucket_table *tbl = NULL;
-   size_t size;
+   size_t size, max_locks;
int i;
 
size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
@@ -227,7 +191,12 @@ static struct bucket_table *bucket_table_alloc(struct 
rhashtable *ht,
 
tbl->size = size;
 
-   if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
+   max_locks = size >> 1;
+   if (tbl->nest)
+   max_locks = min_t(size_t, max_locks, 1U << tbl->nest);
+
+   if (alloc_bucket_spinlocks(>locks, >locks_mask, max_locks,
+  ht->p.locks_mul, gfp) < 0) {
bucket_table_free(tbl);
return NULL;
}
-- 
2.11.0



  1   2   3   4   >