[PATCH][net-next] netlink: avoid to allocate full skb when sending to many devices

2018-09-17 Thread Li RongQing
if skb->head is a vmalloc address, a full allocation of the skb is
required when it is delivered; if there are many devices, the full
allocation will be performed for every device.

Now reuse the skb allocated the first time when iterating over the other
devices to send, reducing full allocations and speeding up delivery.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/netlink/af_netlink.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e3a0538ec0be..095b99e3c1fb 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -278,11 +278,11 @@ static bool netlink_filter_tap(const struct sk_buff *skb)
return false;
 }
 
-static int __netlink_deliver_tap_skb(struct sk_buff *skb,
+static int __netlink_deliver_tap_skb(struct sk_buff **skb,
 struct net_device *dev)
 {
struct sk_buff *nskb;
-   struct sock *sk = skb->sk;
+   struct sock *sk = (*skb)->sk;
int ret = -ENOMEM;
 
if (!net_eq(dev_net(dev), sock_net(sk)))
@@ -290,10 +290,12 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
dev_hold(dev);
 
-   if (is_vmalloc_addr(skb->head))
-   nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
+   if (is_vmalloc_addr((*skb)->head)) {
+   nskb = netlink_to_full_skb(*skb, GFP_ATOMIC);
+   *skb = nskb;
+   }
else
-   nskb = skb_clone(skb, GFP_ATOMIC);
+   nskb = skb_clone(*skb, GFP_ATOMIC);
if (nskb) {
nskb->dev = dev;
nskb->protocol = htons((u16) sk->sk_protocol);
@@ -318,7 +320,7 @@ static void __netlink_deliver_tap(struct sk_buff *skb, 
struct netlink_tap_net *n
return;
 
	list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
-   ret = __netlink_deliver_tap_skb(skb, tmp->dev);
+   ret = __netlink_deliver_tap_skb(&skb, tmp->dev);
if (unlikely(ret))
break;
}
-- 
2.16.2



Re: [bpf PATCH v3 3/3] bpf: test_maps, only support ESTABLISHED socks

2018-09-17 Thread Y Song
On Mon, Sep 17, 2018 at 9:38 PM John Fastabend  wrote:
>
> Ensure that sockets added to a sock{map|hash} that is not in the
> ESTABLISHED state is rejected.
>
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: John Fastabend 

Acked-by: Yonghong Song 


[PATCH] netfilter: nft_osf: use enum nft_data_types for nft_validate_register_store

2018-09-17 Thread Stefan Agner
The function nft_validate_register_store requires an enum of type
nft_data_types. NFTA_DATA_VALUE is of type enum
nft_data_attributes. Pass the correct enum type.

This fixes a warning seen with Clang:
  net/netfilter/nft_osf.c:52:8: warning: implicit conversion from
enumeration type 'enum nft_data_attributes' to different enumeration
type 'enum nft_data_types' [-Wenum-conversion]
  NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN);
  ^~~

Link: https://github.com/ClangBuiltLinux/linux/issues/103
Signed-off-by: Stefan Agner 
---
 net/netfilter/nft_osf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 5af74b37f423..a35fb59ace73 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -49,7 +49,7 @@ static int nft_osf_init(const struct nft_ctx *ctx,
 
priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN);
+ NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN);
if (err < 0)
return err;
 
-- 
2.19.0



Re: [bpf PATCH v3 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread Y Song
On Mon, Sep 17, 2018 at 9:39 PM John Fastabend  wrote:
>
> It is possible (via shutdown()) for TCP socks to go trough TCP_CLOSE
> state via tcp_disconnect() without actually calling tcp_close which
> would then call our bpf_tcp_close() callback. Because of this a user
> could disconnect a socket then put it in a LISTEN state which would
> break our assumptions about sockets always being ESTABLISHED state.
>
> To resolve this rely on the unhash hook, which is called in the
> disconnect case, to remove the sock from the sockmap.
>
> Reported-by: Eric Dumazet 
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: John Fastabend 
> ---
>  0 files changed
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 1f97b55..5680d65 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -132,6 +132,7 @@ struct smap_psock {
> struct work_struct gc_work;
>
> struct proto *sk_proto;
> +   void (*save_unhash)(struct sock *sk);
> void (*save_close)(struct sock *sk, long timeout);
> void (*save_data_ready)(struct sock *sk);
> void (*save_write_space)(struct sock *sk);
> @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
> *msg, size_t len,
>  static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
> int offset, size_t size, int flags);
> +static void bpf_tcp_unhash(struct sock *sk);
>  static void bpf_tcp_close(struct sock *sk, long timeout);
>
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
> @@ -184,6 +186,7 @@ static void build_protos(struct proto 
> prot[SOCKMAP_NUM_CONFIGS],
>  struct proto *base)
>  {
> prot[SOCKMAP_BASE]  = *base;
> +   prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
> prot[SOCKMAP_BASE].close= bpf_tcp_close;
> prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
> prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
> @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk)
> return -EBUSY;
> }
>
> +   psock->save_unhash = sk->sk_prot->unhash;
> psock->save_close = sk->sk_prot->close;
> psock->sk_proto = sk->sk_prot;
>
> @@ -305,30 +309,12 @@ static struct smap_psock_map_entry 
> *psock_map_pop(struct sock *sk,
> return e;
>  }
>
> -static void bpf_tcp_close(struct sock *sk, long timeout)
> +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
>  {
> -   void (*close_fun)(struct sock *sk, long timeout);
> struct smap_psock_map_entry *e;
> struct sk_msg_buff *md, *mtmp;
> -   struct smap_psock *psock;
> struct sock *osk;
>
> -   lock_sock(sk);
> -   rcu_read_lock();
> -   psock = smap_psock_sk(sk);
> -   if (unlikely(!psock)) {
> -   rcu_read_unlock();
> -   release_sock(sk);
> -   return sk->sk_prot->close(sk, timeout);
> -   }
> -
> -   /* The psock may be destroyed anytime after exiting the RCU critial
> -* section so by the time we use close_fun the psock may no longer
> -* be valid. However, bpf_tcp_close is called with the sock lock
> -* held so the close hook and sk are still valid.
> -*/
> -   close_fun = psock->save_close;
> -
> if (psock->cork) {
> free_start_sg(psock->sock, psock->cork, true);
> kfree(psock->cork);
> @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
> kfree(e);
> e = psock_map_pop(sk, psock);
> }
> +}
> +
> +static void bpf_tcp_unhash(struct sock *sk)
> +{
> +   void (*unhash_fun)(struct sock *sk);
> +   struct smap_psock *psock;
> +
> +   rcu_read_lock();
> +   psock = smap_psock_sk(sk);
> +   if (unlikely(!psock)) {
> +   rcu_read_unlock();
> +   if (sk->sk_prot->unhash)
> +   return sk->sk_prot->unhash(sk);
> +   return;
Nit: the above code can be
if (sk->sk_prot->unhash)
   sk->sk_prot->unhash(sk);
return;
Other than this,
Acked-by: Yonghong Song 

> +   }
> +   unhash_fun = psock->save_unhash;
> +   bpf_tcp_remove(sk, psock);
> +   rcu_read_unlock();
> +   unhash_fun(sk);
> +}
> +
> +static void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> +   void (*close_fun)(struct sock *sk, long timeout);
> +   struct smap_psock *psock;
> +
> +   lock_sock(sk);
> +   rcu_read_lock();
> +   psock = smap_psock_sk(sk);
> +   if (unlikely(!psock)) {
> +   rcu_read_unlock();
> +   release_sock(sk);
> +   return sk->sk_prot->close(sk, timeout);
> +   }
> +   close_fun = psock->save_close;
> +   

[PATCH bpf-next] samples/bpf: fix a compilation failure

2018-09-17 Thread Yonghong Song
samples/bpf build failed with the following errors:

  $ make samples/bpf/
  ...
  HOSTCC  samples/bpf/sockex3_user.o
  /data/users/yhs/work/net-next/samples/bpf/sockex3_user.c:16:8: error: 
redefinition of ‘struct bpf_flow_keys’
   struct bpf_flow_keys {
  ^
  In file included from 
/data/users/yhs/work/net-next/samples/bpf/sockex3_user.c:4:0:
  ./usr/include/linux/bpf.h:2338:9: note: originally defined here
struct bpf_flow_keys *flow_keys;
   ^
  make[3]: *** [samples/bpf/sockex3_user.o] Error 1

Commit d58e468b1112d ("flow_dissector: implements flow dissector BPF hook")
introduced struct bpf_flow_keys in include/uapi/linux/bpf.h and hence
caused the naming conflict with samples/bpf/sockex3_user.c.

The fix is to rename struct bpf_flow_keys in samples/bpf/sockex3_user.c
to flow_keys to avoid the conflict.

Signed-off-by: Yonghong Song 
---
 samples/bpf/sockex3_user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 5ba3ae9d180b..22f74d0e1493 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -13,7 +13,7 @@
 #define PARSE_IP_PROG_FD (prog_fd[0])
 #define PROG_ARRAY_FD (map_fd[0])
 
-struct bpf_flow_keys {
+struct flow_keys {
__be32 src;
__be32 dst;
union {
@@ -64,7 +64,7 @@ int main(int argc, char **argv)
(void) f;
 
for (i = 0; i < 5; i++) {
-   struct bpf_flow_keys key = {}, next_key;
+   struct flow_keys key = {}, next_key;
struct pair value;
 
sleep(1);
-- 
2.17.1



[RFC PATCH bpf-next v3 7/7] selftests/bpf: add test cases for queue and stack maps

2018-09-17 Thread Mauricio Vasquez B
Two types of tests are done:
- test_maps: only userspace api.
- test_progs: userspace api and ebpf helpers.

Signed-off-by: Mauricio Vasquez B 
---
 kernel/bpf/helpers.c   |2 
 tools/lib/bpf/bpf.c|   12 ++
 tools/lib/bpf/bpf.h|1 
 tools/testing/selftests/bpf/Makefile   |5 +
 tools/testing/selftests/bpf/bpf_helpers.h  |7 +
 tools/testing/selftests/bpf/test_maps.c|  130 
 tools/testing/selftests/bpf/test_progs.c   |   99 +++
 tools/testing/selftests/bpf/test_queue_map.c   |4 +
 tools/testing/selftests/bpf/test_queue_stack_map.h |   59 +
 tools/testing/selftests/bpf/test_stack_map.c   |4 +
 10 files changed, 321 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_queue_map.c
 create mode 100644 tools/testing/selftests/bpf/test_queue_stack_map.h
 create mode 100644 tools/testing/selftests/bpf/test_stack_map.c

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5f364e6acaf1..1293cd5240e3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -76,7 +76,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
.arg2_type  = ARG_PTR_TO_MAP_KEY,
 };
 
-BPF_CALL_4(bpf_map_push_elem, struct bpf_map *, map, void *, value, u32 size,
+BPF_CALL_4(bpf_map_push_elem, struct bpf_map *, map, void *, value, u32, size,
   u64, flags)
 {
if (map->value_size != size)
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 60aa4ca8b2c5..7056b2eb554d 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -286,6 +286,18 @@ int bpf_map_lookup_elem(int fd, const void *key, void 
*value)
	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
+int bpf_map_lookup_and_delete_elem(int fd, const void *key, const void *value)
+{
+   union bpf_attr attr;
+
+   bzero(&attr, sizeof(attr));
+   attr.map_fd = fd;
+   attr.key = ptr_to_u64(key);
+   attr.value = ptr_to_u64(value);
+
+   return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
+}
+
 int bpf_map_delete_elem(int fd, const void *key)
 {
union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 6f38164b2618..6134ed9517d3 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -86,6 +86,7 @@ int bpf_map_update_elem(int fd, const void *key, const void 
*value,
__u64 flags);
 
 int bpf_map_lookup_elem(int fd, const void *key, void *value);
+int bpf_map_lookup_and_delete_elem(int fd, const void *key, const void *value);
 int bpf_map_delete_elem(int fd, const void *key);
 int bpf_map_get_next_key(int fd, const void *key, void *next_key);
 int bpf_obj_pin(int fd, const char *pathname);
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index fff7fb1285fc..ad8a2b8fb738 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o 
test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o 
test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-   test_skb_cgroup_id_kern.o
+   test_skb_cgroup_id_kern.o test_queue_map.o test_stack_map.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -110,6 +110,9 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
 $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
 
+$(OUTPUT)/test_queue_map.o: test_queue_stack_map.h
+$(OUTPUT)/test_stack_map.o: test_queue_stack_map.h
+
 BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
 BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
 BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 
'usage.*llvm')
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
b/tools/testing/selftests/bpf/bpf_helpers.h
index e4be7730222d..bdbe8f84023e 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -16,6 +16,13 @@ static int (*bpf_map_update_elem)(void *map, void *key, void 
*value,
(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_map_push_elem)(void *map, const void *value, int len,
+   unsigned long long flags) =
+   (void *) BPF_FUNC_map_push_elem;
+static int (*bpf_map_pop_elem)(void *map, void *value, int len) =
+   (void *) BPF_FUNC_map_pop_elem;
+static int (*bpf_map_peek_elem)(void *map, void *value, int len) =
+   (void *) BPF_FUNC_map_peek_elem;
 static int 

[RFC PATCH bpf-next v3 5/7] bpf: restrict use of peek/push/pop

2018-09-17 Thread Mauricio Vasquez B
Restrict the use of peek, push and pop helpers only to queue and stack
maps.

Signed-off-by: Mauricio Vasquez B 
---
 kernel/bpf/verifier.c |   14 ++
 1 file changed, 14 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b9e005188f0e..1628ffe48e32 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2084,6 +2084,13 @@ static int check_map_func_compatibility(struct 
bpf_verifier_env *env,
if (func_id != BPF_FUNC_sk_select_reuseport)
goto error;
break;
+   case BPF_MAP_TYPE_QUEUE:
+   case BPF_MAP_TYPE_STACK:
+   if (func_id != BPF_FUNC_map_peek_elem &&
+   func_id != BPF_FUNC_map_pop_elem &&
+   func_id != BPF_FUNC_map_push_elem)
+   goto error;
+   break;
default:
break;
}
@@ -2139,6 +2146,13 @@ static int check_map_func_compatibility(struct 
bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
goto error;
break;
+   case BPF_FUNC_map_peek_elem:
+   case BPF_FUNC_map_pop_elem:
+   case BPF_FUNC_map_push_elem:
+   if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+   map->map_type != BPF_MAP_TYPE_STACK)
+   goto error;
+   break;
default:
break;
}



[RFC PATCH bpf-next v3 4/7] bpf: add bpf queue and stack maps

2018-09-17 Thread Mauricio Vasquez B
Implement two new kind of maps that support the peek, push and pop
operations.

A use case for this is to keep track of a pool of elements, like
network ports in a SNAT.

Signed-off-by: Mauricio Vasquez B 
---
 include/linux/bpf.h   |3 
 include/linux/bpf_types.h |2 
 include/uapi/linux/bpf.h  |   30 
 kernel/bpf/Makefile   |2 
 kernel/bpf/core.c |3 
 kernel/bpf/helpers.c  |   98 ++
 kernel/bpf/queue_stack_maps.c |  291 +
 kernel/bpf/verifier.c |5 +
 net/core/filter.c |6 +
 9 files changed, 437 insertions(+), 3 deletions(-)
 create mode 100644 kernel/bpf/queue_stack_maps.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c63a44381d3f..8e924b5c5a0e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -807,6 +807,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct 
bpf_map *map,
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
+extern const struct bpf_func_proto bpf_map_push_elem_proto;
+extern const struct bpf_func_proto bpf_map_pop_elem_proto;
+extern const struct bpf_func_proto bpf_map_peek_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 33f7f574b983..903a446f14c3 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -67,3 +67,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
 #endif
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4cda584c6640..c899386dcb2b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -128,6 +128,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+   BPF_MAP_TYPE_QUEUE,
+   BPF_MAP_TYPE_STACK,
 };
 
 enum bpf_prog_type {
@@ -460,6 +462,29 @@ union bpf_attr {
  * Return
  * 0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u32 len,
+ *  u64 flags)
+ * Description
+ * Push an element *value* in *map*. *flags* is one of:
+ *
+ * **BPF_EXIST**
+ * If the queue/stack is full, the oldest element is removed to
+ * make room for this.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value, u32 len)
+ * Description
+ * Pop an element from *map*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value, u32 len)
+ * Description
+ * Get an element from *map* without removing it.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * Description
  * For tracing programs, safely attempt to read *size* bytes from
@@ -2227,7 +2252,10 @@ union bpf_attr {
FN(get_current_cgroup_id),  \
FN(get_local_storage),  \
FN(sk_select_reuseport),\
-   FN(skb_ancestor_cgroup_id),
+   FN(skb_ancestor_cgroup_id), \
+   FN(map_push_elem),  \
+   FN(map_pop_elem),   \
+   FN(map_peek_elem),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e656bce87c8f..2d77bc5b2aca 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,7 +3,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o 
bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1af0826..8d2db076d123 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32)
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
 const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+const struct bpf_func_proto bpf_map_push_elem_proto __weak;
+const struct bpf_func_proto 

[RFC PATCH bpf-next v3 6/7] Sync uapi/bpf.h to tools/include

2018-09-17 Thread Mauricio Vasquez B
Sync both files.

Signed-off-by: Mauricio Vasquez B 
---
 tools/include/uapi/linux/bpf.h |   31 ++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 66917a4eba27..c899386dcb2b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
BPF_TASK_FD_QUERY,
+   BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
@@ -127,6 +128,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+   BPF_MAP_TYPE_QUEUE,
+   BPF_MAP_TYPE_STACK,
 };
 
 enum bpf_prog_type {
@@ -459,6 +462,29 @@ union bpf_attr {
  * Return
  * 0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u32 len,
+ *  u64 flags)
+ * Description
+ * Push an element *value* in *map*. *flags* is one of:
+ *
+ * **BPF_EXIST**
+ * If the queue/stack is full, the oldest element is removed to
+ * make room for this.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value, u32 len)
+ * Description
+ * Pop an element from *map*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value, u32 len)
+ * Description
+ * Get an element from *map* without removing it.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * Description
  * For tracing programs, safely attempt to read *size* bytes from
@@ -2226,7 +2252,10 @@ union bpf_attr {
FN(get_current_cgroup_id),  \
FN(get_local_storage),  \
FN(sk_select_reuseport),\
-   FN(skb_ancestor_cgroup_id),
+   FN(skb_ancestor_cgroup_id), \
+   FN(map_push_elem),  \
+   FN(map_pop_elem),   \
+   FN(map_peek_elem),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call



[RFC PATCH bpf-next v3 0/7] Implement bpf queue/stack maps

2018-09-17 Thread Mauricio Vasquez B
In some applications there is a need to have a pool of free elements, like
for example the list of free L4 ports in a SNAT.  None of the current maps
allows doing it, as it is not possible to get any element without having the
key it is associated to.

This patchset implements two new kind of eBPF maps: queue and stack.
Those maps provide to eBPF programs the peek, push and pop operations, and for
userspace applications a new bpf_map_lookup_and_delete_elem() is added.

Signed-off-by: Mauricio Vasquez B 

v2 -> v3:
 - Return elements by value instead that by reference
 - Implement queue/stack base on array and head + tail indexes
 - Rename stack trace related files to avoid confusion and conflicts

v1 -> v2:
 - Create two separate maps instead of single one + flags
 - Implement bpf_map_lookup_and_delete syscall
 - Support peek operation
 - Define replacement policy through flags in the update() method
 - Add eBPF side tests

---

Mauricio Vasquez B (7):
  bpf: rename stack trace map
  bpf/syscall: allow key to be null in map functions
  bpf: add lookup_and_delete map operation
  bpf: add bpf queue and stack maps
  bpf: restrict use of peek/push/pop
  Sync uapi/bpf.h to tools/include
  selftests/bpf: add test cases for queue and stack maps


 include/linux/bpf.h|4 
 include/linux/bpf_types.h  |4 
 include/uapi/linux/bpf.h   |   31 +
 kernel/bpf/Makefile|4 
 kernel/bpf/core.c  |3 
 kernel/bpf/helpers.c   |   98 +++
 kernel/bpf/queue_stack_maps.c  |  291 +
 kernel/bpf/stackmap.c  |  624 
 kernel/bpf/stacktracemap.c |  624 
 kernel/bpf/syscall.c   |  101 +++
 kernel/bpf/verifier.c  |   19 +
 net/core/filter.c  |6 
 tools/include/uapi/linux/bpf.h |   31 +
 tools/lib/bpf/bpf.c|   12 
 tools/lib/bpf/bpf.h|1 
 tools/testing/selftests/bpf/Makefile   |5 
 tools/testing/selftests/bpf/bpf_helpers.h  |7 
 tools/testing/selftests/bpf/test_maps.c|  130 
 tools/testing/selftests/bpf/test_progs.c   |   99 +++
 tools/testing/selftests/bpf/test_queue_map.c   |4 
 tools/testing/selftests/bpf/test_queue_stack_map.h |   59 ++
 tools/testing/selftests/bpf/test_stack_map.c   |4 
 22 files changed, 1526 insertions(+), 635 deletions(-)
 create mode 100644 kernel/bpf/queue_stack_maps.c
 delete mode 100644 kernel/bpf/stackmap.c
 create mode 100644 kernel/bpf/stacktracemap.c
 create mode 100644 tools/testing/selftests/bpf/test_queue_map.c
 create mode 100644 tools/testing/selftests/bpf/test_queue_stack_map.h
 create mode 100644 tools/testing/selftests/bpf/test_stack_map.c

--



[RFC PATCH bpf-next v3 1/7] bpf: rename stack trace map

2018-09-17 Thread Mauricio Vasquez B
In the following patches queue and stack maps (FIFO and LIFO
datastructures) will be implemented.  In order to avoid confusion and
a possible name clash rename stackmap.c to stacktracemap.c and
stack_map_ops to stack_trace_map_ops

Signed-off-by: Mauricio Vasquez B 
---
 include/linux/bpf_types.h  |2 
 kernel/bpf/Makefile|2 
 kernel/bpf/stackmap.c  |  624 
 kernel/bpf/stacktracemap.c |  624 
 4 files changed, 626 insertions(+), 626 deletions(-)
 delete mode 100644 kernel/bpf/stackmap.c
 create mode 100644 kernel/bpf/stacktracemap.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c090e7c0..33f7f574b983 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -49,7 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops)
 #ifdef CONFIG_PERF_EVENTS
-BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0488b8258321..e656bce87c8f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -20,7 +20,7 @@ endif
 endif
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
-obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+obj-$(CONFIG_BPF_SYSCALL) += stacktracemap.o
 endif
 obj-$(CONFIG_CGROUP_BPF) += cgroup.o
 ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
deleted file mode 100644
index 8061a439ef18..
--- a/kernel/bpf/stackmap.c
+++ /dev/null
@@ -1,624 +0,0 @@
-/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include "percpu_freelist.h"
-
-#define STACK_CREATE_FLAG_MASK \
-   (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |\
-BPF_F_STACK_BUILD_ID)
-
-struct stack_map_bucket {
-   struct pcpu_freelist_node fnode;
-   u32 hash;
-   u32 nr;
-   u64 data[];
-};
-
-struct bpf_stack_map {
-   struct bpf_map map;
-   void *elems;
-   struct pcpu_freelist freelist;
-   u32 n_buckets;
-   struct stack_map_bucket *buckets[];
-};
-
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
-   struct irq_work irq_work;
-   struct rw_semaphore *sem;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
-   struct stack_map_irq_work *work;
-
-   work = container_of(entry, struct stack_map_irq_work, irq_work);
-   up_read(work->sem);
-   work->sem = NULL;
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
-static inline bool stack_map_use_build_id(struct bpf_map *map)
-{
-   return (map->map_flags & BPF_F_STACK_BUILD_ID);
-}
-
-static inline int stack_map_data_size(struct bpf_map *map)
-{
-   return stack_map_use_build_id(map) ?
-   sizeof(struct bpf_stack_build_id) : sizeof(u64);
-}
-
-static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
-{
-   u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
-   int err;
-
-   smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
-smap->map.numa_node);
-   if (!smap->elems)
-   return -ENOMEM;
-
-   err = pcpu_freelist_init(&smap->freelist);
-   if (err)
-   goto free_elems;
-
-   pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
-  smap->map.max_entries);
-   return 0;
-
-free_elems:
-   bpf_map_area_free(smap->elems);
-   return err;
-}
-
-/* Called from syscall */
-static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
-{
-   u32 value_size = attr->value_size;
-   struct bpf_stack_map *smap;
-   u64 cost, n_buckets;
-   int err;
-
-   if (!capable(CAP_SYS_ADMIN))
-   return ERR_PTR(-EPERM);
-
-   if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
-   return ERR_PTR(-EINVAL);
-
-   /* check sanity of attributes */
-   if (attr->max_entries == 0 || attr->key_size != 4 ||
-   value_size < 8 || value_size % 8)
-   return ERR_PTR(-EINVAL);
-
-   BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
-   if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
-   if (value_size % sizeof(struct bpf_stack_build_id) ||
-   value_size / sizeof(struct bpf_stack_build_id)
-   > 

[RFC PATCH bpf-next v3 2/7] bpf/syscall: allow key to be null in map functions

2018-09-17 Thread Mauricio Vasquez B
This commit adds the required logic to allow key being NULL
in case the key_size of the map is 0.

A new __bpf_copy_key function helper only copies the key from
userpsace when key_size != 0, otherwise it enforces that key must be
null.

Signed-off-by: Mauricio Vasquez B 
---
 kernel/bpf/syscall.c |   19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3c9636f03bb2..f2d4e4f280dc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -651,6 +651,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void 
*key, void *value)
return -ENOTSUPP;
 }
 
+static void *__bpf_copy_key(void __user *ukey, u64 key_size)
+{
+   if (key_size)
+   return memdup_user(ukey, key_size);
+
+   if (ukey)
+   return ERR_PTR(-EINVAL);
+
+   return NULL;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
 
@@ -678,7 +689,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
 
-   key = memdup_user(ukey, map->key_size);
+   key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -766,7 +777,7 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}
 
-   key = memdup_user(ukey, map->key_size);
+   key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -864,7 +875,7 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
 
-   key = memdup_user(ukey, map->key_size);
+   key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -916,7 +927,7 @@ static int map_get_next_key(union bpf_attr *attr)
}
 
if (ukey) {
-   key = memdup_user(ukey, map->key_size);
+   key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;



[RFC PATCH bpf-next v3 3/7] bpf: add lookup_and_delete map operation

2018-09-17 Thread Mauricio Vasquez B
The following patch implements a bpf queue/stack maps that
provides the peek/pop/push functions.  There is not a direct
relationship between those functions and the current operations
supported by a map, hence a new lookup_and_delete map operation
is added, this operation would be used by the pop helper.

A pop operation is not added because it will too specific to
stack/queue maps, instead this new operation could be useful
for other maps as well.

Signed-off-by: Mauricio Vasquez B 
---
 include/linux/bpf.h  |1 +
 include/uapi/linux/bpf.h |1 +
 kernel/bpf/syscall.c |   82 ++
 3 files changed, 84 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 523481a3471b..c63a44381d3f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 
flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+   void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
 
/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4eba27..4cda584c6640 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
BPF_TASK_FD_QUERY,
+   BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f2d4e4f280dc..7d429123a298 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -968,6 +968,85 @@ static int map_get_next_key(union bpf_attr *attr)
return err;
 }
 
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+   void __user *ukey = u64_to_user_ptr(attr->key);
+   void __user *uvalue = u64_to_user_ptr(attr->value);
+   int ufd = attr->map_fd;
+   struct bpf_map *map;
+   void *key, *value, *ptr;
+   u32 value_size;
+   struct fd f;
+   int err;
+
+   if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+   return -EINVAL;
+
+   f = fdget(ufd);
+   map = __bpf_map_get(f);
+   if (IS_ERR(map))
+   return PTR_ERR(map);
+
+   if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+   err = -EPERM;
+   goto err_put;
+   }
+
+   if (!map->ops->map_lookup_and_delete_elem) {
+   err = -ENOTSUPP;
+   goto err_put;
+   }
+
+   key = __bpf_copy_key(ukey, map->key_size);
+   if (IS_ERR(key)) {
+   err = PTR_ERR(key);
+   goto err_put;
+   }
+
+   value_size = map->value_size;
+
+   err = -ENOMEM;
+   value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+   if (!value)
+   goto free_key;
+
+   err = -EFAULT;
+   if (copy_from_user(value, uvalue, value_size) != 0)
+   goto free_value;
+
+   /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+* inside bpf map update or delete otherwise deadlocks are possible
+*/
+   preempt_disable();
+   __this_cpu_inc(bpf_prog_active);
+   rcu_read_lock();
+   ptr = map->ops->map_lookup_and_delete_elem(map, key);
+   if (ptr)
+   memcpy(value, ptr, value_size);
+   rcu_read_unlock();
+   err = ptr ? 0 : -ENOENT;
+   __this_cpu_dec(bpf_prog_active);
+   preempt_enable();
+
+   if (err)
+   goto free_value;
+
+   if (copy_to_user(uvalue, value, value_size) != 0)
+   goto free_value;
+
+   err = 0;
+
+free_value:
+   kfree(value);
+free_key:
+   kfree(key);
+err_put:
+   fdput(f);
+   return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
[_id] = & _name ## _prog_ops,
@@ -2428,6 +2507,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
uattr, unsigned int, siz
case BPF_TASK_FD_QUERY:
err = bpf_task_fd_query(, uattr);
break;
+   case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+   err = map_lookup_and_delete_elem();
+   break;
default:
err = -EINVAL;
break;



Re: [bpf PATCH 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread John Fastabend
On 09/17/2018 02:09 PM, Y Song wrote:
> On Mon, Sep 17, 2018 at 10:32 AM John Fastabend
>  wrote:
>>
>> It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
>> state via tcp_disconnect() without actually calling tcp_close which
>> would then call our bpf_tcp_close() callback. Because of this a user
>> could disconnect a socket then put it in a LISTEN state which would
>> break our assumptions about sockets always being ESTABLISHED state.
>>
>> To resolve this rely on the unhash hook, which is called in the
>> disconnect case, to remove the sock from the sockmap.
>>
>> Reported-by: Eric Dumazet 
>> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
>> Signed-off-by: John Fastabend 
>> ---

[...]

>> +{
>> +   void (*unhash_fun)(struct sock *sk);
>> +   struct smap_psock *psock;
>> +
>> +   rcu_read_lock();
>> +   psock = smap_psock_sk(sk);
>> +   if (unlikely(!psock)) {
>> +   rcu_read_unlock();
>> +   release_sock(sk);
> 
> Can socket be released here?
>

Right, it was an error (it can not be released) fixed in v3.


>> +   return sk->sk_prot->unhash(sk);
>> +   }
>> +
>> +   /* The psock may be destroyed anytime after exiting the RCU critial
>> +* section so by the time we use close_fun the psock may no longer
>> +* be valid. However, bpf_tcp_close is called with the sock lock
>> +* held so the close hook and sk are still valid.
>> +*/
> 
> the comments above are not correct. A copy-paste mistake?

I just removed the comments they are not overly helpful at this point
and the commit msg is more useful anyways.

Thanks,
John


Re: [bpf PATCH 3/3] bpf: test_maps, only support ESTABLISHED socks

2018-09-17 Thread John Fastabend
On 09/17/2018 02:21 PM, Y Song wrote:
> On Mon, Sep 17, 2018 at 10:33 AM John Fastabend
>  wrote:
>>
>> Ensure that sockets added to a sock{map|hash} that are not in the
>> ESTABLISHED state are rejected.
>>
>> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
>> Signed-off-by: John Fastabend 
>> ---
>>  tools/testing/selftests/bpf/test_maps.c |   10 +++---
>>  1 file changed, 7 insertions(+), 3 deletions(-)
>>
>> diff --git a/tools/testing/selftests/bpf/test_maps.c 
>> b/tools/testing/selftests/bpf/test_maps.c
>> index 6f54f84..0f2090f 100644
>> --- a/tools/testing/selftests/bpf/test_maps.c
>> +++ b/tools/testing/selftests/bpf/test_maps.c
>> @@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data)
>> /* Test update without programs */
>> for (i = 0; i < 6; i++) {
>> err = bpf_map_update_elem(fd, , [i], BPF_ANY);
>> -   if (err) {
>> +   if (i < 2 && !err) {
>> +   printf("Allowed update sockmap '%i:%i' not in 
>> ESTABLISHED\n",
>> +  i, sfd[i]);
>> +   goto out_sockmap;
>> +   } else if (i > 1 && err) {
> 
> Just a nit. Maybe "i >= 2" since it will be more clear since it is
> opposite of "i < 2"?
> 

Seems reasonable changed in v3 to 'i >= 2'. Thanks.


[bpf PATCH v3 1/3] bpf: sockmap only allow ESTABLISHED sock state

2018-09-17 Thread John Fastabend
After this patch we only allow socks that are in ESTABLISHED state or
are being added via a sock_ops event that is transitioning into an
ESTABLISHED state. By allowing sock_ops events we allow users to
manage sockmaps directly from sock ops programs. The two supported
sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and
BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB.

Similar to TLS ULP this ensures sk_user_data is correct.

Reported-by: Eric Dumazet 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
Acked-by: Yonghong Song 
---
 0 files changed

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 488ef96..1f97b55 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state.
+*/
if (skops.sk->sk_type != SOCK_STREAM ||
-   skops.sk->sk_protocol != IPPROTO_TCP) {
+   skops.sk->sk_protocol != IPPROTO_TCP ||
+   skops.sk->sk_state != TCP_ESTABLISHED) {
fput(socket->file);
return -EOPNOTSUPP;
}
@@ -2453,6 +2457,16 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state.
+*/
+   if (skops.sk->sk_type != SOCK_STREAM ||
+   skops.sk->sk_protocol != IPPROTO_TCP ||
+   skops.sk->sk_state != TCP_ESTABLISHED) {
+   fput(socket->file);
+   return -EOPNOTSUPP;
+   }
+
lock_sock(skops.sk);
preempt_disable();
rcu_read_lock();
@@ -2543,10 +2557,22 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
.map_check_btf = map_check_no_btf,
 };
 
+static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
+{
+   return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+  ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
   struct bpf_map *, map, void *, key, u64, flags)
 {
WARN_ON_ONCE(!rcu_read_lock_held());
+
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state. This checks that the sock ops triggering the update is
+* one indicating we are (or will be soon) in an ESTABLISHED state.
+*/
+   if (!bpf_is_valid_sock_op(bpf_sock))
+   return -EOPNOTSUPP;
return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
 }
 
@@ -2565,6 +2591,9 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
   struct bpf_map *, map, void *, key, u64, flags)
 {
WARN_ON_ONCE(!rcu_read_lock_held());
+
+   if (!bpf_is_valid_sock_op(bpf_sock))
+   return -EOPNOTSUPP;
return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
 }
 



[bpf PATCH v3 3/3] bpf: test_maps, only support ESTABLISHED socks

2018-09-17 Thread John Fastabend
Ensure that sockets added to a sock{map|hash} that are not in the
ESTABLISHED state are rejected.

Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 tools/testing/selftests/bpf/test_maps.c |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index 6f54f84..9b552c0 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data)
/* Test update without programs */
for (i = 0; i < 6; i++) {
err = bpf_map_update_elem(fd, , [i], BPF_ANY);
-   if (err) {
+   if (i < 2 && !err) {
+   printf("Allowed update sockmap '%i:%i' not in 
ESTABLISHED\n",
+  i, sfd[i]);
+   goto out_sockmap;
+   } else if (i >= 2 && err) {
printf("Failed noprog update sockmap '%i:%i'\n",
   i, sfd[i]);
goto out_sockmap;
@@ -741,7 +745,7 @@ static void test_sockmap(int tasks, void *data)
}
 
/* Test map update elem afterwards fd lives in fd and map_fd */
-   for (i = 0; i < 6; i++) {
+   for (i = 2; i < 6; i++) {
err = bpf_map_update_elem(map_fd_rx, , [i], BPF_ANY);
if (err) {
printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
@@ -845,7 +849,7 @@ static void test_sockmap(int tasks, void *data)
}
 
/* Delete the elems without programs */
-   for (i = 0; i < 6; i++) {
+   for (i = 2; i < 6; i++) {
err = bpf_map_delete_elem(fd, );
if (err) {
printf("Failed delete sockmap %i '%i:%i'\n",



[bpf PATCH v3 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread John Fastabend
It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
state via tcp_disconnect() without actually calling tcp_close which
would then call our bpf_tcp_close() callback. Because of this a user
could disconnect a socket then put it in a LISTEN state which would
break our assumptions about sockets always being ESTABLISHED state.

To resolve this rely on the unhash hook, which is called in the
disconnect case, to remove the sock from the sockmap.

Reported-by: Eric Dumazet 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 0 files changed

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 1f97b55..5680d65 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -132,6 +132,7 @@ struct smap_psock {
struct work_struct gc_work;
 
struct proto *sk_proto;
+   void (*save_unhash)(struct sock *sk);
void (*save_close)(struct sock *sk, long timeout);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
@@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
*msg, size_t len,
 static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
+static void bpf_tcp_unhash(struct sock *sk);
 static void bpf_tcp_close(struct sock *sk, long timeout);
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -184,6 +186,7 @@ static void build_protos(struct proto 
prot[SOCKMAP_NUM_CONFIGS],
 struct proto *base)
 {
prot[SOCKMAP_BASE]  = *base;
+   prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
prot[SOCKMAP_BASE].close= bpf_tcp_close;
prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
@@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk)
return -EBUSY;
}
 
+   psock->save_unhash = sk->sk_prot->unhash;
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
 
@@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct 
sock *sk,
return e;
 }
 
-static void bpf_tcp_close(struct sock *sk, long timeout)
+static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
 {
-   void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
-   struct smap_psock *psock;
struct sock *osk;
 
-   lock_sock(sk);
-   rcu_read_lock();
-   psock = smap_psock_sk(sk);
-   if (unlikely(!psock)) {
-   rcu_read_unlock();
-   release_sock(sk);
-   return sk->sk_prot->close(sk, timeout);
-   }
-
-   /* The psock may be destroyed anytime after exiting the RCU critial
-* section so by the time we use close_fun the psock may no longer
-* be valid. However, bpf_tcp_close is called with the sock lock
-* held so the close hook and sk are still valid.
-*/
-   close_fun = psock->save_close;
-
if (psock->cork) {
free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
@@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(e);
e = psock_map_pop(sk, psock);
}
+}
+
+static void bpf_tcp_unhash(struct sock *sk)
+{
+   void (*unhash_fun)(struct sock *sk);
+   struct smap_psock *psock;
+
+   rcu_read_lock();
+   psock = smap_psock_sk(sk);
+   if (unlikely(!psock)) {
+   rcu_read_unlock();
+   if (sk->sk_prot->unhash)
+   return sk->sk_prot->unhash(sk);
+   return;
+   }
+   unhash_fun = psock->save_unhash;
+   bpf_tcp_remove(sk, psock);
+   rcu_read_unlock();
+   unhash_fun(sk);
+}
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+   void (*close_fun)(struct sock *sk, long timeout);
+   struct smap_psock *psock;
+
+   lock_sock(sk);
+   rcu_read_lock();
+   psock = smap_psock_sk(sk);
+   if (unlikely(!psock)) {
+   rcu_read_unlock();
+   release_sock(sk);
+   return sk->sk_prot->close(sk, timeout);
+   }
+   close_fun = psock->save_close;
+   bpf_tcp_remove(sk, psock);
rcu_read_unlock();
release_sock(sk);
close_fun(sk, timeout);



[bpf PATCH v3 0/3] bpf, sockmap ESTABLISHED state only

2018-09-17 Thread John Fastabend
Eric noted that using the close callback is not sufficient
to catch all transitions from ESTABLISHED state to a LISTEN
state. So this series does two things. First, only allow
adding socks in ESTABLISH state and second use unhash callback
to catch tcp_disconnect() transitions.

v2: added check for ESTABLISH state in hash update sockmap as well
v3: Do not release lock from unhash in error path, no lock was
used in the first place. And drop not so useful code comments

Thanks for reviewing Yonghong I carried your ACK forward
on patch 1/3.

Thanks,
John

---

John Fastabend (3):
  bpf: sockmap only allow ESTABLISHED sock state
  bpf: sockmap, fix transition through disconnect without close
  bpf: test_maps, only support ESTABLISHED socks


 tools/testing/selftests/bpf/test_maps.c |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

--
Signature


[pull request][RESEND net 0/3] Mellanox, mlx5 fixes 2018-09-17

2018-09-17 Thread Saeed Mahameed
Hi Dave,

Sorry about the previous submission of this series which was mistakenly
marked for net-next, here I am resending with 'net' mark.

This series provides three fixes to mlx5 core and mlx5e netdevice
driver.

Please pull and let me know if there's any problem.

For -stable v4.16:
('net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ')

Thanks,
Saeed.

---

The following changes since commit c73480910e9686a5c25155cb4d418d594b678196:

  net: ethernet: Fix a unused function warning. (2018-09-17 08:24:25 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-fixes-2018-09-17

for you to fetch changes up to 8f92e35aff9692028279d3c03e88547df6d15020:

  net/mlx5e: TLS, Read capabilities only when it is safe (2018-09-17 15:12:31 
-0700)


mlx5-fixes-2018-09-17


Alaa Hleihel (1):
  net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ

Eli Cohen (1):
  net/mlx5: Fix read from coherent memory

Saeed Mahameed (1):
  net/mlx5e: TLS, Read capabilities only when it is safe

 drivers/net/ethernet/mellanox/mlx5/core/cmd.c  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)


[RESEND net 2/3] net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ

2018-09-17 Thread Saeed Mahameed
From: Alaa Hleihel 

When modifying hairpin SQ, instead of checking if the next state equals
to MLX5_SQC_STATE_RDY, we compare it against the MLX5_RQC_STATE_RDY enum
value.

The code worked since both of MLX5_RQC_STATE_RDY and MLX5_SQC_STATE_RDY
have the same value today.

This patch fixes this issue.

Fixes: 18e568c390c6 ("net/mlx5: Hairpin pair core object setup")
Signed-off-by: Alaa Hleihel 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c 
b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index dae1c5c5d27c..d2f76070ea7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -509,7 +509,7 @@ static int mlx5_hairpin_modify_sq(struct mlx5_core_dev 
*peer_mdev, u32 sqn,
 
sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
-   if (next_state == MLX5_RQC_STATE_RDY) {
+   if (next_state == MLX5_SQC_STATE_RDY) {
MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq);
MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca);
}
-- 
2.17.1



[RESEND net 3/3] net/mlx5e: TLS, Read capabilities only when it is safe

2018-09-17 Thread Saeed Mahameed
Read TLS caps from the core driver only when TLS is supported, i.e
mlx5_accel_is_tls_device returns true.

Fixes: 790af90c00d2 ("net/mlx5e: TLS, build TLS netdev from capabilities")
Reported-by: Michal Kubecek 
Signed-off-by: Saeed Mahameed 
Reviewed-by: Boris Pismenny 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index eddd7702680b..e88340e196f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -183,12 +183,13 @@ static const struct tlsdev_ops mlx5e_tls_ops = {
 
 void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
 {
-   u32 caps = mlx5_accel_tls_device_caps(priv->mdev);
struct net_device *netdev = priv->netdev;
+   u32 caps;
 
if (!mlx5_accel_is_tls_device(priv->mdev))
return;
 
+   caps = mlx5_accel_tls_device_caps(priv->mdev);
if (caps & MLX5_ACCEL_TLS_TX) {
netdev->features  |= NETIF_F_HW_TLS_TX;
netdev->hw_features   |= NETIF_F_HW_TLS_TX;
-- 
2.17.1



[RESEND net 1/3] net/mlx5: Fix read from coherent memory

2018-09-17 Thread Saeed Mahameed
From: Eli Cohen 

Use accessor function READ_ONCE to read from coherent memory modified
by the device and read by the driver. This becomes most important in
preemptive kernels where cond_resched implementation does not have the
side effect which guaranteed the updated value.

Fixes: 269d26f47f6f ("net/mlx5: Reduce command polling interval")
Signed-off-by: Eli Cohen 
Reported-by: Jesper Dangaard Brouer 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c 
b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 3ce14d42ddc8..a53736c26c0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -206,7 +206,7 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent)
u8 own;
 
do {
-   own = ent->lay->status_own;
+   own = READ_ONCE(ent->lay->status_own);
if (!(own & CMD_OWNER_HW)) {
ent->ret = 0;
return;
-- 
2.17.1



Re: [pull request][net-next 0/3] Mellanox, mlx5 fixes 2018-09-17

2018-09-17 Thread Saeed Mahameed
On Mon, Sep 17, 2018 at 7:37 PM David Miller  wrote:
>
> From: Saeed Mahameed 
> Date: Mon, 17 Sep 2018 17:01:58 -0700
>
> > This series provides three fixes to mlx5 core and mlx5e netdevice
> > driver.
> >
> > Please pull and let me know if there's any problem.
> >
> > For -stable v4.16:
> > ('net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ')
>
> Patches need to target 'net' in order to be queued up for -stable.
>
> It is not appropriate to submit patches for 'net-next' and ask for
> those to be submitted to -stable.
>

oops, they are meant for net, I will resubmit.
Sorry.

> Thank you.


Re: [Patch net v2] net/ipv6: do not copy dst flags on rt init

2018-09-17 Thread David Miller
From: Peter Oskolkov 
Date: Mon, 17 Sep 2018 10:20:53 -0700

> DST_NOCOUNT in dst_entry::flags tracks whether the entry counts
> toward route cache size (net->ipv6.sysctl.ip6_rt_max_size).
> 
> If the flag is NOT set, dst_ops::pcpuc_entries counter is incremented
> in dist_init() and decremented in dst_destroy().
> 
> This flag is tied to allocation/deallocation of dst_entry and
> should not be copied from another dst/route. Otherwise it can happen
> that dst_ops::pcpuc_entries counter grows until no new routes can
> be allocated because the counter reached ip6_rt_max_size due to
> DST_NOCOUNT not set and thus no counter decrements on gc-ed routes.
> 
> Fixes: 3b6761d18bc1 ("net/ipv6: Move dst flags to booleans in fib entries")
> Cc: David Ahern 
> Acked-by: Wei Wang 
> Signed-off-by: Peter Oskolkov 

Applied and queued up for -stable, thank you.


Re: [PATCH v2 net] net/ipv4: defensive cipso option parsing

2018-09-17 Thread David Miller
From: Stefan Nuernberger 
Date: Mon, 17 Sep 2018 19:46:53 +0200

> commit 40413955ee26 ("Cipso: cipso_v4_optptr enter infinite loop") fixed
> a possible infinite loop in the IP option parsing of CIPSO. The fix
> assumes that ip_options_compile filtered out all zero length options and
> that no other one-byte options beside IPOPT_END and IPOPT_NOOP exist.
> While this assumption currently holds true, add explicit checks for zero
> length and invalid length options to be safe for the future. Even though
> ip_options_compile should have validated the options, the introduction of
> new one-byte options can still confuse this code without the additional
> checks.
> 
> Signed-off-by: Stefan Nuernberger 

Applied to net-next.

This is not 'net' nor -stable material.  I'm hesitent about this
change as-is, and ip_options_compile() is not changing semantics in
-stable in the way that you say can cause problems.


Re: [pull request][net-next 0/3] Mellanox, mlx5 fixes 2018-09-17

2018-09-17 Thread David Miller
From: Saeed Mahameed 
Date: Mon, 17 Sep 2018 17:01:58 -0700

> This series provides three fixes to mlx5 core and mlx5e netdevice
> driver.
> 
> Please pull and let me know if there's any problem.
> 
> For -stable v4.16:
> ('net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ')

Patches need to target 'net' in order to be queued up for -stable.

It is not appropriate to submit patches for 'net-next' and ask for
those to be submitted to -stable.

Thank you.


Re: [PATCH] net: caif: remove redundant null check on frontpkt

2018-09-17 Thread David Miller
From: Colin King 
Date: Fri, 14 Sep 2018 18:19:16 +0100

> From: Colin Ian King 
> 
> It is impossible for frontpkt to be null at the point of the null
> check because it has been assigned from rearpkt and there is no
> way realpkt can be null at the point of the assignment because
> of the sanity checking and exit paths taken previously. Remove
> the redundant null check.
> 
> Detected by CoverityScan, CID#114434 ("Logically dead code")
> 
> Signed-off-by: Colin Ian King 

Applied to net-next with typo fixed.

Thanks.


[net-next 2/3] net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ

2018-09-17 Thread Saeed Mahameed
From: Alaa Hleihel 

When modifying hairpin SQ, instead of checking if the next state equals
to MLX5_SQC_STATE_RDY, we compare it against the MLX5_RQC_STATE_RDY enum
value.

The code worked since both of MLX5_RQC_STATE_RDY and MLX5_SQC_STATE_RDY
have the same value today.

This patch fixes this issue.

Fixes: 18e568c390c6 ("net/mlx5: Hairpin pair core object setup")
Signed-off-by: Alaa Hleihel 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c 
b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index dae1c5c5d27c..d2f76070ea7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -509,7 +509,7 @@ static int mlx5_hairpin_modify_sq(struct mlx5_core_dev 
*peer_mdev, u32 sqn,
 
sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
-   if (next_state == MLX5_RQC_STATE_RDY) {
+   if (next_state == MLX5_SQC_STATE_RDY) {
MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq);
MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca);
}
-- 
2.17.1



[pull request][net-next 0/3] Mellanox, mlx5 fixes 2018-09-17

2018-09-17 Thread Saeed Mahameed
This series provides three fixes to mlx5 core and mlx5e netdevice
driver.

Please pull and let me know if there's any problem.

For -stable v4.16:
('net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ')

Thanks,
Saeed.

---

The following changes since commit c73480910e9686a5c25155cb4d418d594b678196:

  net: ethernet: Fix a unused function warning. (2018-09-17 08:24:25 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-fixes-2018-09-17

for you to fetch changes up to 8f92e35aff9692028279d3c03e88547df6d15020:

  net/mlx5e: TLS, Read capabilities only when it is safe (2018-09-17 15:12:31 
-0700)


mlx5-fixes-2018-09-17


Alaa Hleihel (1):
  net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ

Eli Cohen (1):
  net/mlx5: Fix read from coherent memory

Saeed Mahameed (1):
  net/mlx5e: TLS, Read capabilities only when it is safe

 drivers/net/ethernet/mellanox/mlx5/core/cmd.c  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)


[net-next 3/3] net/mlx5e: TLS, Read capabilities only when it is safe

2018-09-17 Thread Saeed Mahameed
Read TLS caps from the core driver only when TLS is supported, i.e
mlx5_accel_is_tls_device returns true.

Fixes: 790af90c00d2 ("net/mlx5e: TLS, build TLS netdev from capabilities")
Reported-by: Michal Kubecek 
Signed-off-by: Saeed Mahameed 
Reviewed-by: Boris Pismenny 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index eddd7702680b..e88340e196f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -183,12 +183,13 @@ static const struct tlsdev_ops mlx5e_tls_ops = {
 
 void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
 {
-   u32 caps = mlx5_accel_tls_device_caps(priv->mdev);
struct net_device *netdev = priv->netdev;
+   u32 caps;
 
if (!mlx5_accel_is_tls_device(priv->mdev))
return;
 
+   caps = mlx5_accel_tls_device_caps(priv->mdev);
if (caps & MLX5_ACCEL_TLS_TX) {
netdev->features  |= NETIF_F_HW_TLS_TX;
netdev->hw_features   |= NETIF_F_HW_TLS_TX;
-- 
2.17.1



[net-next 1/3] net/mlx5: Fix read from coherent memory

2018-09-17 Thread Saeed Mahameed
From: Eli Cohen 

Use accessor function READ_ONCE to read from coherent memory modified
by the device and read by the driver. This becomes most important in
preemptive kernels where cond_resched implementation does not have the
side effect which guaranteed the updated value.

Fixes: 269d26f47f6f ("net/mlx5: Reduce command polling interval")
Signed-off-by: Eli Cohen 
Reported-by: Jesper Dangaard Brouer 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c 
b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 3ce14d42ddc8..a53736c26c0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -206,7 +206,7 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent)
u8 own;
 
do {
-   own = ent->lay->status_own;
+   own = READ_ONCE(ent->lay->status_own);
if (!(own & CMD_OWNER_HW)) {
ent->ret = 0;
return;
-- 
2.17.1



Re: [PATCH net-next 5/5] ebpf: Add sample ebpf program for SOCKET_SG_FILTER

2018-09-17 Thread Alexei Starovoitov
On Mon, Sep 17, 2018 at 07:23:48PM -0400, Sowmini Varadhan wrote:
> On (09/17/18 16:15), Alexei Starovoitov wrote:
> > 
> > if the goal is to add firewall ability to RDS then the patch set
> > is going in the wrong direction.
> 
> The goal is to add the ability to process scatterlist directly,
> just like we process skb's today.

that doesn't answer my question and doesn't address the objection.
It is still a nack.



Re: [PATCH net-next 5/5] ebpf: Add sample ebpf program for SOCKET_SG_FILTER

2018-09-17 Thread Sowmini Varadhan
On (09/17/18 16:15), Alexei Starovoitov wrote:
> 
> if the goal is to add firewall ability to RDS then the patch set
> is going in the wrong direction.

The goal is to add the ability to process scatterlist directly,
just like we process skb's today.

Your main objection was that you wanted a test case in selftests
that was aligned with existing tests, Tushar is working on that
patchset. Why dont we wait for that patchset before continuing
this discussion further? 

> May be the right answer is to teach rds to behave like the rest of protocols.
> Then all existing tooling and features will 'just work' ?

RDS does not need to be taught anything :-) I think KCM is modeled
on the RDS stack model. Before we "teach" rds anything, "we" need
to understand what RDS does first - google can provide lot of slide-decks
that explain the rds stack to you, suggest you look at that first. 

Meanwhile, how about waiting for Tushar's next patchset, where
you will have your selftests that are based on veth/netns
just like exising tests for XDP. vxlan etc. I strongly suggest
waiting for that.

And btw, it would have been very useful/courteous to help with 
the RFC reviews to start with.

--Sowmini



Re: [PATCH net-next 5/5] ebpf: Add sample ebpf program for SOCKET_SG_FILTER

2018-09-17 Thread Alexei Starovoitov
On Thu, Sep 13, 2018 at 06:10:13AM -0400, Sowmini Varadhan wrote:
> On (09/12/18 19:07), Alexei Starovoitov wrote:
> > 
> > I didn't know that. The way I understand your statement that
> > this new program type, new sg logic, and all the complexity
> > are only applicable to RDMA capable hw and RDS.
> 
> I dont know if you have been following the RFC series at all 
> (and DanielB/JohnF feedback to it) but that is not what the patch 
> set is about.
> 
> To repeat a summary of the original problem statement:
> 
> RDS (hardly a "niche" driver, let's please not get carried away 
> with strong assertions based on incomplete understanding), 
> is an example of a driver that happens to pass up packets
> as both scatterlist and sk_buffs to the ULPs. 
> 
> The scatterlist comes from IB, the sk_buffs come from the ethernet
> drivers. At the moment, the only way to build firewalls for
> this is to convert scatterlist to skb and use  either netfilter
> or eBPF on the skb. What Tushar is adding is support to use eBPF
> on the scatterlist itself, so that you dont have to do this
> inefficient scatterlist->skb conversion.

if the goal is to add firewall ability to RDS then the patch set
is going in the wrong direction.
New bpf prog type and attaching to sockets isn't going to be
helpful in building firewalls.
Also there was a mention of some form of 'redirect' for some
future use? That doesn't fit the firewall goal as well.
I think it would be the best to start from scratch and discuss
the bigger goal first.
May be the right answer is to teach rds to behave like the rest of protocols.
Then all existing tooling and features will 'just work' ?



[PATCH bpf-next v2] tools/bpf: bpftool: improve output format for bpftool net

2018-09-17 Thread Yonghong Song
This is a followup patch for Commit f6f3bac08ff9
("tools/bpf: bpftool: add net support").
Some improvements are made for the bpftool net output.
Specifically, plain output is more concise such that
each attachment should nicely fit in one line.
Compared to previous output, the prog tag is removed
since it can be easily obtained with program id.
Similar to xdp attachments, the device name is added
to tc attachments.

The bpf program attached through shared block
mechanism is supported as well.
  $ ip link add dev v1 type veth peer name v2
  $ tc qdisc add dev v1 ingress_block 10 egress_block 20 clsact
  $ tc qdisc add dev v2 ingress_block 10 egress_block 20 clsact
  $ tc filter add block 10 protocol ip prio 25 bpf obj bpf_shared.o sec ingress 
flowid 1:1
  $ tc filter add block 20 protocol ip prio 30 bpf obj bpf_cyclic.o sec 
classifier flowid 1:1
  $ bpftool net
  xdp:

  tc:
  v2(7) clsact/ingress bpf_shared.o:[ingress] id 23
  v2(7) clsact/egress bpf_cyclic.o:[classifier] id 24
  v1(8) clsact/ingress bpf_shared.o:[ingress] id 23
  v1(8) clsact/egress bpf_cyclic.o:[classifier] id 24

The documentation and "bpftool net help" are updated
to make it clear that current implementation only
supports xdp and tc attachments. For programs
attached to cgroups, "bpftool cgroup" can be used
to dump attachments. For other programs e.g.
sk_{filter,skb,msg,reuseport} and lwt/seg6,
iproute2 tools should be used.

The new output:
  $ bpftool net
  xdp:
  eth0(2) driver id 198

  tc:
  eth0(2) clsact/ingress fbflow_icmp id 335 act [{icmp_action id 336}]
  eth0(2) clsact/egress fbflow_egress id 334
  $ bpftool -jp net
  [{
"xdp": [{
"devname": "eth0",
"ifindex": 2,
"mode": "driver",
"id": 198
}
],
"tc": [{
"devname": "eth0",
"ifindex": 2,
"kind": "clsact/ingress",
"name": "fbflow_icmp",
"id": 335,
"act": [{
"name": "icmp_action",
"id": 336
}
]
},{
"devname": "eth0",
"ifindex": 2,
"kind": "clsact/egress",
"name": "fbflow_egress",
"id": 334
}
]
}
  ]

Signed-off-by: Yonghong Song 
---
 .../bpf/bpftool/Documentation/bpftool-net.rst |  78 +++--
 tools/bpf/bpftool/main.h  |   3 +-
 tools/bpf/bpftool/net.c   | 103 --
 tools/bpf/bpftool/netlink_dumper.c|  85 +++
 tools/bpf/bpftool/netlink_dumper.h|  22 ++--
 5 files changed, 161 insertions(+), 130 deletions(-)

Changelogs:
 v1 -> v2:
. Addressed several output format suggestions from Daniel

diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst 
b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 48a61837a264..408ec30d8872 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -26,9 +26,20 @@ NET COMMANDS
 DESCRIPTION
 ===
**bpftool net { show | list } [ dev name ]**
- List all networking device driver and tc attachment in the 
system.
-
-  Output will start with all xdp program attachment, followed 
by
+  List bpf program attachments in the kernel networking 
subsystem.
+
+  Currently, only device driver xdp attachments and tc filter
+  classification/action attachments are implemented, i.e., for
+  program types **BPF_PROG_TYPE_SCHED_CLS**,
+  **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**.
+  For programs attached to a particular cgroup, e.g.,
+  **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
+  **BPF_PROG_TYPE_SOCK_OPS** and 
**BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+  users can use **bpftool cgroup** to dump cgroup attachments.
+  For sk_{filter, skb, msg, reuseport} and lwt/seg6
+  bpf programs, users should consult other tools, e.g., 
iproute2.
+
+  The current output will start with all xdp program 
attachments, followed by
   all tc class/qdisc bpf program attachments. Both xdp 
programs and
   tc programs are ordered based on ifindex number. If multiple 
bpf
   programs attached to the same networking device through **tc 
filter**,
@@ -61,21 +72,15 @@ EXAMPLES
 
 ::
 
-  xdp [
-  ifindex 2 devname eth0 prog_id 198
-  ]
-  tc_filters [
-  ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb]
-prog_id 111727 tag d08fe3b4319bc2fd act []
-  ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp
-prog_id 130246 tag 3f265c7f26db62c9 act []
-  

Re: [PATCH bpf-next] tools/bpf: bpftool: improve output format for bpftool net

2018-09-17 Thread Yonghong Song


On 9/17/18 3:19 AM, Daniel Borkmann wrote:
> On 09/14/2018 11:49 PM, Yonghong Song wrote:
>> This is a followup patch for Commit f6f3bac08ff9
>> ("tools/bpf: bpftool: add net support").
>> Some improvements are made for the bpftool net output.
>> Specially, plain output is more concise such that
>> per attachment should nicely fit in one line.
>> Compared to previous output, the prog tag is removed
>> since it can be easily obtained with program id.
>> Similar to xdp attachments, the device name is added
>> to tc_filters attachments.
>>
>> The bpf program attached through shared block
>> mechanism is supported as well.
>>$ ip link add dev v1 type veth peer name v2
>>$ tc qdisc add dev v1 ingress_block 10 egress_block 20 clsact
>>$ tc qdisc add dev v2 ingress_block 10 egress_block 20 clsact
>>$ tc filter add block 10 protocol ip prio 25 bpf obj bpf_shared.o sec 
>> ingress flowid 1:1
>>$ tc filter add block 20 protocol ip prio 30 bpf obj bpf_cyclic.o sec 
>> classifier flowid 1:1
>>$ bpftool net
>>xdp [
>>]
>>tc_filters [
>> v2(7) qdisc_clsact_ingress bpf_shared.o:[ingress] id 23
>> v2(7) qdisc_clsact_egress bpf_cyclic.o:[classifier] id 24
>> v1(8) qdisc_clsact_ingress bpf_shared.o:[ingress] id 23
>> v1(8) qdisc_clsact_egress bpf_cyclic.o:[classifier] id 24
> 
> Just one minor note for this one here, do we even need the "qdisc_" prefix? 
> Couldn't it just simply
> be "clsact/ingress", "clsact/egress", "htb" etc?

Will do.

> 
>>]
>>
>> The documentation and "bpftool net help" are updated
>> to make it clear that current implementation only
>> supports xdp and tc attachments. For programs
>> attached to cgroups, "bpftool cgroup" can be used
>> to dump attachments. For other programs e.g.
>> sk_{filter,skb,msg,reuseport} and lwt/seg6,
>> iproute2 tools should be used.
>>
>> The new output:
>>$ bpftool net
>>xdp [
>> eth0(2) id/drv 198
> 
> Could we change the "id/{drv,offload,generic} xyz" into e.g. "eth0(2) 
> {driver,offload,generic} id 198",
> meaning, the "id xyz" being a child of either "driver", "offload" or 
> "generic". Reason would be two-fold:
> i) we can keep the "id xyz" notion consistent as used under "tc_filters", and 
> ii) it allows to put further
> information aside from just "id" member under "driver", "offload" or 
> "generic" in the future.

Will do.

> 
>>]
>>tc_filters [
> 
> Nit: can we use just "tc" for the above? Main use case would be clsact with 
> one of its two hooks anyway,
> and the term "filter" is sort of tc historic; while being correct bpf progs 
> would do much more than just
> filtering, and context is pretty clear anyway from qdisc that we subsequently 
> dump.

Make sense.

Will address all these comments and submit a revision soon. Thanks!

> 
>> eth0(2) qdisc_clsact_ingress fbflow_icmp id 335 act [{icmp_action id 
>> 336}]
>> eth0(2) qdisc_clsact_egress fbflow_egress id 334
>>]
>>$ bpftool -jp net
>>[{
>>  "xdp": [{
>>  "devname": "eth0",
>>  "ifindex": 2,
>>  "id/drv": 198
>>  }
>>  ],
>>  "tc_filters": [{
>>  "devname": "eth0",
>>  "ifindex": 2,
>>  "kind": "qdisc_clsact_ingress",
>>  "name": "fbflow_icmp",
>>  "id": 335,
>>  "act": [{
>>  "name": "icmp_action",
>>  "id": 336
>>  }
>>  ]
>>  },{
>>  "devname": "eth0",
>>  "ifindex": 2,
>>  "kind": "qdisc_clsact_egress",
>>  "name": "fbflow_egress",
>>  "id": 334
>>  }
>>  ]
>>  }
>>]
>>
>> Signed-off-by: Yonghong Song 


Re: [bpf PATCH 3/3] bpf: test_maps, only support ESTABLISHED socks

2018-09-17 Thread Y Song
On Mon, Sep 17, 2018 at 10:33 AM John Fastabend
 wrote:
>
> Ensure that sockets added to a sock{map|hash} that is not in the
> ESTABLISHED state is rejected.
>
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: John Fastabend 
> ---
>  tools/testing/selftests/bpf/test_maps.c |   10 +++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/test_maps.c 
> b/tools/testing/selftests/bpf/test_maps.c
> index 6f54f84..0f2090f 100644
> --- a/tools/testing/selftests/bpf/test_maps.c
> +++ b/tools/testing/selftests/bpf/test_maps.c
> @@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data)
> /* Test update without programs */
> for (i = 0; i < 6; i++) {
> err = bpf_map_update_elem(fd, , [i], BPF_ANY);
> -   if (err) {
> +   if (i < 2 && !err) {
> +   printf("Allowed update sockmap '%i:%i' not in 
> ESTABLISHED\n",
> +  i, sfd[i]);
> +   goto out_sockmap;
> +   } else if (i > 1 && err) {

Just a nit. Maybe "i >= 2" since it will be more clear since it is
opposite of "i < 2"?

> printf("Failed noprog update sockmap '%i:%i'\n",
>i, sfd[i]);
> goto out_sockmap;
> @@ -741,7 +745,7 @@ static void test_sockmap(int tasks, void *data)
> }
>
> /* Test map update elem afterwards fd lives in fd and map_fd */
> -   for (i = 0; i < 6; i++) {
> +   for (i = 2; i < 6; i++) {
> err = bpf_map_update_elem(map_fd_rx, , [i], BPF_ANY);
> if (err) {
> printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
> @@ -845,7 +849,7 @@ static void test_sockmap(int tasks, void *data)
> }
>
> /* Delete the elems without programs */
> -   for (i = 0; i < 6; i++) {
> +   for (i = 2; i < 6; i++) {
> err = bpf_map_delete_elem(fd, );
> if (err) {
> printf("Failed delete sockmap %i '%i:%i'\n",
>


Re: [bpf PATCH 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread Y Song
On Mon, Sep 17, 2018 at 10:32 AM John Fastabend
 wrote:
>
> It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
> state via tcp_disconnect() without actually calling tcp_close which
> would then call our bpf_tcp_close() callback. Because of this a user
> could disconnect a socket then put it in a LISTEN state which would
> break our assumptions about sockets always being ESTABLISHED state.
>
> To resolve this rely on the unhash hook, which is called in the
> disconnect case, to remove the sock from the sockmap.
>
> Reported-by: Eric Dumazet 
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: John Fastabend 
> ---
>  kernel/bpf/sockmap.c |   71 
> +-
>  1 file changed, 52 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 998b7bd..f6ab7f3 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -132,6 +132,7 @@ struct smap_psock {
> struct work_struct gc_work;
>
> struct proto *sk_proto;
> +   void (*save_unhash)(struct sock *sk);
> void (*save_close)(struct sock *sk, long timeout);
> void (*save_data_ready)(struct sock *sk);
> void (*save_write_space)(struct sock *sk);
> @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
> *msg, size_t len,
>  static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
> int offset, size_t size, int flags);
> +static void bpf_tcp_unhash(struct sock *sk);
>  static void bpf_tcp_close(struct sock *sk, long timeout);
>
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
> @@ -184,6 +186,7 @@ static void build_protos(struct proto 
> prot[SOCKMAP_NUM_CONFIGS],
>  struct proto *base)
>  {
> prot[SOCKMAP_BASE]  = *base;
> +   prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
> prot[SOCKMAP_BASE].close= bpf_tcp_close;
> prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
> prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
> @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk)
> return -EBUSY;
> }
>
> +   psock->save_unhash = sk->sk_prot->unhash;
> psock->save_close = sk->sk_prot->close;
> psock->sk_proto = sk->sk_prot;
>
> @@ -305,30 +309,12 @@ static struct smap_psock_map_entry 
> *psock_map_pop(struct sock *sk,
> return e;
>  }
>
> -static void bpf_tcp_close(struct sock *sk, long timeout)
> +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
>  {
> -   void (*close_fun)(struct sock *sk, long timeout);
> struct smap_psock_map_entry *e;
> struct sk_msg_buff *md, *mtmp;
> -   struct smap_psock *psock;
> struct sock *osk;
>
> -   lock_sock(sk);
> -   rcu_read_lock();
> -   psock = smap_psock_sk(sk);
> -   if (unlikely(!psock)) {
> -   rcu_read_unlock();
> -   release_sock(sk);
> -   return sk->sk_prot->close(sk, timeout);
> -   }
> -
> -   /* The psock may be destroyed anytime after exiting the RCU critial
> -* section so by the time we use close_fun the psock may no longer
> -* be valid. However, bpf_tcp_close is called with the sock lock
> -* held so the close hook and sk are still valid.
> -*/
> -   close_fun = psock->save_close;
> -
> if (psock->cork) {
> free_start_sg(psock->sock, psock->cork, true);
> kfree(psock->cork);
> @@ -379,6 +365,53 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
> kfree(e);
> e = psock_map_pop(sk, psock);
> }
> +}
> +
> +static void bpf_tcp_unhash(struct sock *sk)
> +{
> +   void (*unhash_fun)(struct sock *sk);
> +   struct smap_psock *psock;
> +
> +   rcu_read_lock();
> +   psock = smap_psock_sk(sk);
> +   if (unlikely(!psock)) {
> +   rcu_read_unlock();
> +   release_sock(sk);

Can socket be released here?

> +   return sk->sk_prot->unhash(sk);
> +   }
> +
> +   /* The psock may be destroyed anytime after exiting the RCU critial
> +* section so by the time we use close_fun the psock may no longer
> +* be valid. However, bpf_tcp_close is called with the sock lock
> +* held so the close hook and sk are still valid.
> +*/

the comments above are not correct. A copy-paste mistake?

> +   unhash_fun = psock->save_unhash;
> +   bpf_tcp_remove(sk, psock);
> +   rcu_read_unlock();
> +   unhash_fun(sk);
> +}
> +
> +static void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> +   void (*close_fun)(struct sock *sk, long timeout);
> +   struct smap_psock 

Re: [bpf PATCH 1/3] bpf: sockmap only allow ESTABLISHED sock state

2018-09-17 Thread Y Song
On Mon, Sep 17, 2018 at 10:32 AM John Fastabend
 wrote:
>
> After this patch we only allow socks that are in ESTABLISHED state or
> are being added via a sock_ops event that is transitioning into an
> ESTABLISHED state. By allowing sock_ops events we allow users to
> manage sockmaps directly from sock ops programs. The two supported
> sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and
> BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB.
>
> Similar to TLS ULP this ensures sk_user_data is correct.
>
> Reported-by: Eric Dumazet 
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: John Fastabend 

Acked-by: Yonghong Song 


Re: [PATCH v2 net] net/ipv4: defensive cipso option parsing

2018-09-17 Thread Paul Moore
On Mon, Sep 17, 2018 at 1:49 PM Stefan Nuernberger  wrote:
> commit 40413955ee26 ("Cipso: cipso_v4_optptr enter infinite loop") fixed
> a possible infinite loop in the IP option parsing of CIPSO. The fix
> assumes that ip_options_compile filtered out all zero length options and
> that no other one-byte options beside IPOPT_END and IPOPT_NOOP exist.
> While this assumption currently holds true, add explicit checks for zero
> length and invalid length options to be safe for the future. Even though
> ip_options_compile should have validated the options, the introduction of
> new one-byte options can still confuse this code without the additional
> checks.
>
> Signed-off-by: Stefan Nuernberger 
> Cc: David Woodhouse 
> Cc: Simon Veith 
> Cc: sta...@vger.kernel.org
> ---
>  net/ipv4/cipso_ipv4.c | 11 +++
>  1 file changed, 7 insertions(+), 4 deletions(-)

See my previous comments about the necessity of this patch, but beyond
that it looks fine to me.

Acked-by: Paul Moore 

> diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
> index 82178cc69c96..777fa3b7fb13 100644
> --- a/net/ipv4/cipso_ipv4.c
> +++ b/net/ipv4/cipso_ipv4.c
> @@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct 
> cipso_v4_doi *doi_def,
>   *
>   * Description:
>   * Parse the packet's IP header looking for a CIPSO option.  Returns a 
> pointer
> - * to the start of the CIPSO option on success, NULL if one if not found.
> + * to the start of the CIPSO option on success, NULL if one is not found.
>   *
>   */
>  unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
> @@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff 
> *skb)
> int optlen;
> int taglen;
>
> -   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
> +   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {

Not worth re-spinning this patch, but looking at this a bit closer, we
could probably optimize the "optlen > 1" tweak a bit further by using
CIPSO_V4_HDR_LEN instead of "1" since we only care about CIPSO headers
here.

Although given the nature of IPv4 options, I'm not sure this would
ever really have an impact, let alone a noticeable impact.

> switch (optptr[0]) {
> -   case IPOPT_CIPSO:
> -   return optptr;
> case IPOPT_END:
> return NULL;
> case IPOPT_NOOP:
> @@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff 
> *skb)
> default:
> taglen = optptr[1];
> }
> +   if (!taglen || taglen > optlen)
> +   return NULL;
> +   if (optptr[0] == IPOPT_CIPSO)
> +   return optptr;
> +
> optlen -= taglen;
> optptr += taglen;
> }
> --
> 2.19.0

-- 
paul moore
www.paul-moore.com


Re: [PATCH v2 2/2] netlink: add ethernet address policy types

2018-09-17 Thread Marcelo Ricardo Leitner
On Mon, Sep 17, 2018 at 11:57:29AM +0200, Johannes Berg wrote:
> From: Johannes Berg 
> 
> Commonly, ethernet addresses are just using a policy of
>   { .len = ETH_ALEN }
> which leaves userspace free to send more data than it should,
> which may hide bugs.
> 
> Introduce NLA_EXACT_LEN which checks for exact size, rejecting
> the attribute if it's not exactly that length. Also add
> NLA_EXACT_LEN_WARN which requires the minimum length and will
> warn on longer attributes, for backward compatibility.
> 
> Use these to define NLA_POLICY_ETH_ADDR (new strict policy) and
> NLA_POLICY_ETH_ADDR_COMPAT (compatible policy with warning);
> these are used like this:
> 
> static const struct nla_policy [...] = {
> [NL_ATTR_NAME] = NLA_POLICY_ETH_ADDR,
> ...
> };
> 
> Signed-off-by: Johannes Berg 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> v2: add only NLA_EXACT_LEN/NLA_EXACT_LEN_WARN and build on top
> of that for ethernet address validation, so it can be extended
> for other types (e.g. IPv6 addresses)
> ---
>  include/net/netlink.h | 13 +
>  lib/nlattr.c  |  8 +++-
>  2 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index b318b0a9f6c3..318b1ded3833 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -181,6 +181,8 @@ enum {
>   NLA_S64,
>   NLA_BITFIELD32,
>   NLA_REJECT,
> + NLA_EXACT_LEN,
> + NLA_EXACT_LEN_WARN,
>   __NLA_TYPE_MAX,
>  };
>  
> @@ -211,6 +213,10 @@ enum {
>   * just like "All other"
>   *NLA_BITFIELD32   Unused
>   *NLA_REJECT   Unused
> + *NLA_EXACT_LENAttribute must have exactly this length, otherwise
> + * it is rejected.
> + *NLA_EXACT_LEN_WARN   Attribute should have exactly this length, a 
> warning
> + * is logged if it is longer, shorter is rejected.
>   *All otherMinimum length of attribute payload
>   *
>   * Meaning of `validation_data' field:
> @@ -236,6 +242,13 @@ struct nla_policy {
>   void*validation_data;
>  };
>  
> +#define NLA_POLICY_EXACT_LEN(_len)   { .type = NLA_EXACT_LEN, .len = _len }
> +#define NLA_POLICY_EXACT_LEN_WARN(_len)  { .type = NLA_EXACT_LEN_WARN, \
> +   .len = _len }
> +
> +#define NLA_POLICY_ETH_ADDR  NLA_POLICY_EXACT_LEN(ETH_ALEN)
> +#define NLA_POLICY_ETH_ADDR_COMPAT   NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
> +
>  /**
>   * struct nl_info - netlink source information
>   * @nlh: Netlink message header of original request
> diff --git a/lib/nlattr.c b/lib/nlattr.c
> index 36d74b079151..bb6fe5ed4ecf 100644
> --- a/lib/nlattr.c
> +++ b/lib/nlattr.c
> @@ -82,12 +82,18 @@ static int validate_nla(const struct nlattr *nla, int 
> maxtype,
>  
>   BUG_ON(pt->type > NLA_TYPE_MAX);
>  
> - if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
> + if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) ||
> + (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) {
>   pr_warn_ratelimited("netlink: '%s': attribute type %d has an 
> invalid length.\n",
>   current->comm, type);
>   }
>  
>   switch (pt->type) {
> + case NLA_EXACT_LEN:
> + if (attrlen != pt->len)
> + return -ERANGE;
> + break;
> +
>   case NLA_REJECT:
>   if (pt->validation_data && error_msg)
>   *error_msg = pt->validation_data;
> -- 
> 2.14.4
> 


Re: [PATCH v2 1/2] netlink: add NLA_REJECT policy type

2018-09-17 Thread Marcelo Ricardo Leitner
On Mon, Sep 17, 2018 at 11:57:28AM +0200, Johannes Berg wrote:
> From: Johannes Berg 
> 
> In some situations some netlink attributes may be used for output
> only (kernel->userspace) or may be reserved for future use. It's
> then helpful to be able to prevent userspace from using them in
> messages sent to the kernel, since they'd otherwise be ignored and
> any future use will become impossible if this happens.
> 
> Add NLA_REJECT to the policy which does nothing but reject (with
> EINVAL) validation of any messages containing this attribute.
> Allow for returning a specific extended ACK error message in the
> validation_data pointer.
> 
> While at it clear up the documentation a bit - the NLA_BITFIELD32
> documentation was added to the list of len field descriptions.
> 
> Also, use NL_SET_BAD_ATTR() in one place where it's open-coded.
> 
> The specific case I have in mind now is a shared nested attribute
> containing request/response data, and it would be pointless and
> potentially confusing to have userspace include response data in
> the messages that actually contain a request.
> 
> Signed-off-by: Johannes Berg 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> v2: preserve behaviour of overwriting the extack message, with
> either the generic or the specific one now
> ---
>  include/net/netlink.h | 13 -
>  lib/nlattr.c  | 23 ---
>  2 files changed, 28 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index 0c154f98e987..b318b0a9f6c3 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -180,6 +180,7 @@ enum {
>   NLA_S32,
>   NLA_S64,
>   NLA_BITFIELD32,
> + NLA_REJECT,
>   __NLA_TYPE_MAX,
>  };
>  
> @@ -208,9 +209,19 @@ enum {
>   *NLA_MSECSLeaving the length field zero will verify the
>   * given type fits, using it verifies minimum length
>   * just like "All other"
> - *NLA_BITFIELD32  A 32-bit bitmap/bitselector attribute
> + *NLA_BITFIELD32   Unused
> + *NLA_REJECT   Unused
>   *All otherMinimum length of attribute payload
>   *
> + * Meaning of `validation_data' field:
> + *NLA_BITFIELD32   This is a 32-bit bitmap/bitselector attribute and
> + * validation data must point to a u32 value of valid
> + * flags
> + *NLA_REJECT   This attribute is always rejected and validation 
> data
> + * may point to a string to report as the error 
> instead
> + * of the generic one in extended ACK.
> + *All otherUnused
> + *
>   * Example:
>   * static const struct nla_policy my_policy[ATTR_MAX+1] = {
>   *   [ATTR_FOO] = { .type = NLA_U16 },
> diff --git a/lib/nlattr.c b/lib/nlattr.c
> index e335bcafa9e4..36d74b079151 100644
> --- a/lib/nlattr.c
> +++ b/lib/nlattr.c
> @@ -69,7 +69,8 @@ static int validate_nla_bitfield32(const struct nlattr *nla,
>  }
>  
>  static int validate_nla(const struct nlattr *nla, int maxtype,
> - const struct nla_policy *policy)
> + const struct nla_policy *policy,
> + const char **error_msg)
>  {
>   const struct nla_policy *pt;
>   int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
> @@ -87,6 +88,11 @@ static int validate_nla(const struct nlattr *nla, int 
> maxtype,
>   }
>  
>   switch (pt->type) {
> + case NLA_REJECT:
> + if (pt->validation_data && error_msg)
> + *error_msg = pt->validation_data;
> + return -EINVAL;
> +
>   case NLA_FLAG:
>   if (attrlen > 0)
>   return -ERANGE;
> @@ -180,11 +186,10 @@ int nla_validate(const struct nlattr *head, int len, 
> int maxtype,
>   int rem;
>  
>   nla_for_each_attr(nla, head, len, rem) {
> - int err = validate_nla(nla, maxtype, policy);
> + int err = validate_nla(nla, maxtype, policy, NULL);
>  
>   if (err < 0) {
> - if (extack)
> - extack->bad_attr = nla;
> + NL_SET_BAD_ATTR(extack, nla);
>   return err;
>   }
>   }
> @@ -250,11 +255,15 @@ int nla_parse(struct nlattr **tb, int maxtype, const 
> struct nlattr *head,
>   u16 type = nla_type(nla);
>  
>   if (type > 0 && type <= maxtype) {
> + static const char _msg[] = "Attribute failed policy 
> validation";
> + const char *msg = _msg;
> +
>   if (policy) {
> - err = validate_nla(nla, maxtype, policy);
> + err = validate_nla(nla, maxtype, policy, );
>   if (err < 0) {
> - NL_SET_ERR_MSG_ATTR(extack, nla,
> - 

Re: [PATCH rdma-next 00/24] Extend DEVX functionality

2018-09-17 Thread Leon Romanovsky
On Mon, Sep 17, 2018 at 11:13:55PM +0300, Or Gerlitz wrote:
> On Mon, Sep 17, 2018 at 11:07 PM, Leon Romanovsky  wrote:
> > On Mon, Sep 17, 2018 at 10:51:29PM +0300, Or Gerlitz wrote:
> >> On Mon, Sep 17, 2018 at 10:34 PM, Leon Romanovsky  
> >> wrote:
> >> > On Mon, Sep 17, 2018 at 02:03:53PM +0300, Leon Romanovsky wrote:
> >> >> From: Leon Romanovsky 
> >> >>
> >> >> From Yishai,
> >> >>
> >> >> This series comes to enable the DEVX functionality in some wider scope,
> >> >> specifically,
> >> >> - It enables using kernel objects that were created by the verbs
> >> >>   API in the DEVX flow.
> >> >> - It enables white list commands without DEVX user context.
> >> >> - It enables the IB link layer under CAP_NET_RAW capabilities.
> >> >> - It exposes the PRM handles for RAW QP (i.e. TIRN, TISN, RQN, SQN)
> >> >>   to be used later on directly by the DEVX interface.
> >> >>
> >> >> In General,
> >> >> Each object that is created/destroyed/modified via verbs will be stamped
> >> >> with a UID based on its user context. This is already done for DEVX 
> >> >> objects
> >> >> commands.
> >> >>
> >> >> This will enable the firmware to enforce the usage of kernel objects
> >> >> from the DEVX flow by validating that the same UID is used and the 
> >> >> resources are
> >> >> really related to the same user.
> >> >>
> >> >> For example in case a CQ was created with verbs it will be stamped with
> >> >> UID and once will be pointed by a DEVX create QP command the firmware 
> >> >> will
> >> >> validate that the input CQN really belongs to the UID which issues the 
> >> >> create QP
> >> >> command.
> >> >>
> >> >> As of the above, all the PRM objects (except of the public ones which
> >> >> are managed by the kernel e.g. FLOW, etc.) will have a UID upon their
> >> >> create/modify/destroy commands. The detection of UMEM / physical
> >> >> addresses in the relevant commands will be done by firmware according 
> >> >> to a 'umem
> >> >> valid bit' as the UID may be used in both cases.
> >> >>
> >> >> The series also enables white list commands which don't require a
> >> >> specific DEVX context, instead of this a device UID is used so that
> >> >> the firmware will mask un-privileged functionality. The IB link layer
> >> >> is also enabled once CAP_NET_RAW permission exists.
> >> >>
> >> >> To enable using the RAW QP underlay objects (e.g. TIRN, RQN, etc.) later
> >> >> on by DEVX commands the UHW output for this case was extended to return 
> >> >> this
> >> >> data when a DEVX context is used.
> >> >>
> >> >> Thanks
> >> >>
> >> >> Leon Romanovsky (1):
> >> >>   net/mlx5: Update mlx5_ifc with DEVX UID bits
> >> >>
> >> >> Yishai Hadas (24):
> >> >>   net/mlx5: Set uid as part of CQ commands
> >> >>   net/mlx5: Set uid as part of QP commands
> >> >>   net/mlx5: Set uid as part of RQ commands
> >> >>   net/mlx5: Set uid as part of SQ commands
> >> >>   net/mlx5: Set uid as part of SRQ commands
> >> >>   net/mlx5: Set uid as part of DCT commands
> >> >
> >> > Hi Doug and Jason,
> >> >
> >> > Do you want me to resend 7 patches above in one series and other patches
> >> > in another series just to be below 15 patches limit? Please be aware
> >> > that those patches above are going to mlx5-next and not to
> >> > net-next/rdma-next.
> >> >
> >> > No rebase, no code change, no much meaning too, but it is your call.
> >>
> >> how about yes! for stop shitting on Dave Miller?
> >
> > Or, are you ok?
> >
> > This series is not relevant to Dave Miller and he didn't even listed in CC 
> > or TO.
>
> correct, but Dave asked MLNX/Saeed to do X, you should respect X when you post
> to the community Dave is maintaining, even if he didn't ask you, not
> doing so hurts
> our positioning with Dave.

Saeed is going to see/apply/review first 7 patches, which is less than 15,
so we are ok here.

>
>
> >
> > I still prefer to hear answer from respective maintainer to whom this
> > series was sent.
>
> Your maintainer asked you to do X, just do it, once and for all

Both Doug and Jason known how to write emails, they will request "X"
if THEY decide that it is needed/better, there is no need to be their
voice.

Thanks


Re: [PATCH rdma-next 00/24] Extend DEVX functionality

2018-09-17 Thread Or Gerlitz
On Mon, Sep 17, 2018 at 11:07 PM, Leon Romanovsky  wrote:
> On Mon, Sep 17, 2018 at 10:51:29PM +0300, Or Gerlitz wrote:
>> On Mon, Sep 17, 2018 at 10:34 PM, Leon Romanovsky  
>> wrote:
>> > On Mon, Sep 17, 2018 at 02:03:53PM +0300, Leon Romanovsky wrote:
>> >> From: Leon Romanovsky 
>> >>
>> >> From Yishai,
>> >>
>> >> This series comes to enable the DEVX functionality in some wider scope,
>> >> specifically,
>> >> - It enables using kernel objects that were created by the verbs
>> >>   API in the DEVX flow.
>> >> - It enables white list commands without DEVX user context.
>> >> - It enables the IB link layer under CAP_NET_RAW capabilities.
>> >> - It exposes the PRM handles for RAW QP (i.e. TIRN, TISN, RQN, SQN)
>> >>   to be used later on directly by the DEVX interface.
>> >>
>> >> In General,
>> >> Each object that is created/destroyed/modified via verbs will be stamped
>> >> with a UID based on its user context. This is already done for DEVX 
>> >> objects
>> >> commands.
>> >>
>> >> This will enable the firmware to enforce the usage of kernel objects
>> >> from the DEVX flow by validating that the same UID is used and the 
>> >> resources are
>> >> really related to the same user.
>> >>
>> >> For example in case a CQ was created with verbs it will be stamped with
>> >> UID and once will be pointed by a DEVX create QP command the firmware will
>> >> validate that the input CQN really belongs to the UID which issues the 
>> >> create QP
>> >> command.
>> >>
>> >> As of the above, all the PRM objects (except of the public ones which
>> >> are managed by the kernel e.g. FLOW, etc.) will have a UID upon their
>> >> create/modify/destroy commands. The detection of UMEM / physical
>> >> addressed in the relevant commands will be done by firmware according to 
>> >> a 'umem
>> >> valid bit' as the UID may be used in both cases.
>> >>
>> >> The series also enables white list commands which don't require a
>> >> specific DEVX context, instead of this a device UID is used so that
>> >> the firmware will mask un-privileged functionality. The IB link layer
>> >> is also enabled once CAP_NET_RAW permission exists.
>> >>
>> >> To enable using the RAW QP underlay objects (e.g. TIRN, RQN, etc.) later
>> >> on by DEVX commands the UHW output for this case was extended to return 
>> >> this
>> >> data when a DEVX context is used.
>> >>
>> >> Thanks
>> >>
>> >> Leon Romanovsky (1):
>> >>   net/mlx5: Update mlx5_ifc with DEVX UID bits
>> >>
>> >> Yishai Hadas (24):
>> >>   net/mlx5: Set uid as part of CQ commands
>> >>   net/mlx5: Set uid as part of QP commands
>> >>   net/mlx5: Set uid as part of RQ commands
>> >>   net/mlx5: Set uid as part of SQ commands
>> >>   net/mlx5: Set uid as part of SRQ commands
>> >>   net/mlx5: Set uid as part of DCT commands
>> >
>> > Hi Doug and Jason,
>> >
>> > Do you want me to resend 7 patches above in one series and other patches
>> > in another series just to be below 15 patches limit? Please be aware
>> > that those patches above are going to mlx5-next and not to
>> > net-next/rdma-next.
>> >
>> > No rebase, no code change, no much meaning too, but it is your call.
>>
>> how about yes! for stop shitting on Dave Miller?
>
> Or, are you ok?
>
> This series is not relevant to Dave Miller and he didn't even listed in CC or 
> TO.

correct, but Dave asked MLNX/Saeed to do X, you should respect X when you post
to the community Dave is maintaining, even if he didn't ask you, not
doing so hurts
our positioning with Dave.


>
> I still prefer to hear answer from respective maintainer to whom this
> series was sent.

Your maintainer asked you to do X, just do it, once and for all


Re: [PATCH rdma-next 00/24] Extend DEVX functionality

2018-09-17 Thread Leon Romanovsky
On Mon, Sep 17, 2018 at 10:51:29PM +0300, Or Gerlitz wrote:
> On Mon, Sep 17, 2018 at 10:34 PM, Leon Romanovsky  wrote:
> > On Mon, Sep 17, 2018 at 02:03:53PM +0300, Leon Romanovsky wrote:
> >> From: Leon Romanovsky 
> >>
> >> From Yishai,
> >>
> >> This series comes to enable the DEVX functionality in some wider scope,
> >> specifically,
> >> - It enables using kernel objects that were created by the verbs
> >>   API in the DEVX flow.
> >> - It enables white list commands without DEVX user context.
> >> - It enables the IB link layer under CAP_NET_RAW capabilities.
> >> - It exposes the PRM handles for RAW QP (i.e. TIRN, TISN, RQN, SQN)
> >>   to be used later on directly by the DEVX interface.
> >>
> >> In General,
> >> Each object that is created/destroyed/modified via verbs will be stamped
> >> with a UID based on its user context. This is already done for DEVX objects
> >> commands.
> >>
> >> This will enable the firmware to enforce the usage of kernel objects
> >> from the DEVX flow by validating that the same UID is used and the 
> >> resources are
> >> really related to the same user.
> >>
> >> For example in case a CQ was created with verbs it will be stamped with
> >> UID and once will be pointed by a DEVX create QP command the firmware will
> >> validate that the input CQN really belongs to the UID which issues the 
> >> create QP
> >> command.
> >>
> >> As of the above, all the PRM objects (except of the public ones which
> >> are managed by the kernel e.g. FLOW, etc.) will have a UID upon their
> >> create/modify/destroy commands. The detection of UMEM / physical
> >> addressed in the relevant commands will be done by firmware according to a 
> >> 'umem
> >> valid bit' as the UID may be used in both cases.
> >>
> >> The series also enables white list commands which don't require a
> >> specific DEVX context, instead of this a device UID is used so that
> >> the firmware will mask un-privileged functionality. The IB link layer
> >> is also enabled once CAP_NET_RAW permission exists.
> >>
> >> To enable using the RAW QP underlay objects (e.g. TIRN, RQN, etc.) later
> >> on by DEVX commands the UHW output for this case was extended to return 
> >> this
> >> data when a DEVX context is used.
> >>
> >> Thanks
> >>
> >> Leon Romanovsky (1):
> >>   net/mlx5: Update mlx5_ifc with DEVX UID bits
> >>
> >> Yishai Hadas (24):
> >>   net/mlx5: Set uid as part of CQ commands
> >>   net/mlx5: Set uid as part of QP commands
> >>   net/mlx5: Set uid as part of RQ commands
> >>   net/mlx5: Set uid as part of SQ commands
> >>   net/mlx5: Set uid as part of SRQ commands
> >>   net/mlx5: Set uid as part of DCT commands
> >
> > Hi Doug and Jason,
> >
> > Do you want me to resend 7 patches above in one series and other patches
> > in another series just to be below 15 patches limit? Please be aware
> > that those patches above are going to mlx5-next and not to
> > net-next/rdma-next.
> >
> > No rebase, no code change, no much meaning too, but it is your call.
>
> how about yes! for stop shitting on Dave Miller?

Or, are you ok?

This series is not relevant to Dave Miller and he didn't even listed in CC or 
TO.

I still prefer to hear answer from respective maintainer to whom this
series was sent.

Thanks


Re: [PATCH ethtool] ethtool: support combinations of FEC modes

2018-09-17 Thread John W. Linville
On Wed, Sep 05, 2018 at 06:54:57PM +0100, Edward Cree wrote:
> Of the three drivers that currently support FEC configuration, two (sfc
>  and cxgb4[vf]) accept configurations with more than one bit set in the
>  feccmd.fec bitmask.  (The precise semantics of these combinations vary.)
> Thus, this patch adds the ability to specify such combinations through a
>  comma-separated list of FEC modes in the 'encoding' argument on the
>  command line.
> 
> Also adds --set-fec tests to test-cmdline.c, and corrects the man page
>  (the encoding argument is not optional) while updating it.
> 
> Signed-off-by: Edward Cree 
> ---
> I've CCed the maintainers of the other drivers (cxgb4, nfp) that support
>  --set-fec, in case they have opinions on this.
> I'm not totally happy with the man page changebar; it might be clearer
>  just to leave the comma-less version in the syntax synopsis and only
>  mention the commas in the running-text.

LGTM -- queued for next release...thanks!

John
 
>  ethtool.8.in   | 11 ---
>  ethtool.c  | 50 +++---
>  test-cmdline.c |  9 +
>  3 files changed, 56 insertions(+), 14 deletions(-)
> 
> diff --git a/ethtool.8.in b/ethtool.8.in
> index c8a902a..414eaa1 100644
> --- a/ethtool.8.in
> +++ b/ethtool.8.in
> @@ -389,7 +389,8 @@ ethtool \- query or control network driver and hardware 
> settings
>  .HP
>  .B ethtool \-\-set\-fec
>  .I devname
> -.B4 encoding auto off rs baser
> +.B encoding
> +.BR auto | off | rs | baser [ , ...]
>  .
>  .\" Adjust lines (i.e. full justification) and hyphenate.
>  .ad
> @@ -1119,8 +1120,12 @@ current FEC mode, the driver or firmware must take the 
> link down
>  administratively and report the problem in the system logs for users to 
> correct.
>  .RS 4
>  .TP
> -.A4 encoding auto off rs baser
> -Sets the FEC encoding for the device.
> +.BR encoding\ auto | off | rs | baser [ , ...]
> +
> +Sets the FEC encoding for the device.  Combinations of options are specified 
> as
> +e.g.
> +.B auto,rs
> +; the semantics of such combinations vary between drivers.
>  .TS
>  nokeep;
>  lB   l.
> diff --git a/ethtool.c b/ethtool.c
> index e8b7703..9997930 100644
> --- a/ethtool.c
> +++ b/ethtool.c
> @@ -4967,20 +4967,48 @@ static int do_set_phy_tunable(struct cmd_context *ctx)
>  
>  static int fecmode_str_to_type(const char *str)
>  {
> + if (!strcasecmp(str, "auto"))
> + return ETHTOOL_FEC_AUTO;
> + if (!strcasecmp(str, "off"))
> + return ETHTOOL_FEC_OFF;
> + if (!strcasecmp(str, "rs"))
> + return ETHTOOL_FEC_RS;
> + if (!strcasecmp(str, "baser"))
> + return ETHTOOL_FEC_BASER;
> +
> + return 0;
> +}
> +
> +/* Takes a comma-separated list of FEC modes, returns the bitwise OR of their
> + * corresponding ETHTOOL_FEC_* constants.
> + * Accepts repetitions (e.g. 'auto,auto') and trailing comma (e.g. 'off,').
> + */
> +static int parse_fecmode(const char *str)
> +{
>   int fecmode = 0;
> + char buf[6];
>  
>   if (!str)
> - return fecmode;
> -
> - if (!strcasecmp(str, "auto"))
> - fecmode |= ETHTOOL_FEC_AUTO;
> - else if (!strcasecmp(str, "off"))
> - fecmode |= ETHTOOL_FEC_OFF;
> - else if (!strcasecmp(str, "rs"))
> - fecmode |= ETHTOOL_FEC_RS;
> - else if (!strcasecmp(str, "baser"))
> - fecmode |= ETHTOOL_FEC_BASER;
> + return 0;
> + while (*str) {
> + size_t next;
> + int mode;
>  
> + next = strcspn(str, ",");
> + if (next >= 6) /* Bad mode, longest name is 5 chars */
> + return 0;
> + /* Copy into temp buffer and nul-terminate */
> + memcpy(buf, str, next);
> + buf[next] = 0;
> + mode = fecmode_str_to_type(buf);
> + if (!mode) /* Bad mode encountered */
> + return 0;
> + fecmode |= mode;
> + str += next;
> + /* Skip over ',' (but not nul) */
> + if (*str)
> + str++;
> + }
>   return fecmode;
>  }
>  
> @@ -5028,7 +5056,7 @@ static int do_sfec(struct cmd_context *ctx)
>   if (!fecmode_str)
>   exit_bad_args();
>  
> - fecmode = fecmode_str_to_type(fecmode_str);
> + fecmode = parse_fecmode(fecmode_str);
>   if (!fecmode)
>   exit_bad_args();
>  
> diff --git a/test-cmdline.c b/test-cmdline.c
> index a94edea..9c51cca 100644
> --- a/test-cmdline.c
> +++ b/test-cmdline.c
> @@ -266,6 +266,15 @@ static struct test_case {
>   { 0, "--set-eee devname tx-timer 42 advertise 0x4321" },
>   { 1, "--set-eee devname tx-timer foo" },
>   { 1, "--set-eee devname advertise foo" },
> + { 1, "--set-fec devname" },
> + { 0, "--set-fec devname encoding auto" },
> + { 0, "--set-fec devname encoding off," },
> + { 0, "--set-fec devname encoding baser,rs" },
> + { 0, 

Re: [bpf PATCH v2 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread John Fastabend
On 09/17/2018 10:59 AM, John Fastabend wrote:
> It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
> state via tcp_disconnect() without actually calling tcp_close which
> would then call our bpf_tcp_close() callback. Because of this a user
> could disconnect a socket then put it in a LISTEN state which would
> break our assumptions about sockets always being ESTABLISHED state.
> 
> To resolve this rely on the unhash hook, which is called in the
> disconnect case, to remove the sock from the sockmap.
> 

Sorry for the noise will need a v3 actually.

> Reported-by: Eric Dumazet 
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: John Fastabend 
> ---
>  kernel/bpf/sockmap.c |   71 
> +-
>  1 file changed, 52 insertions(+), 19 deletions(-)

[...]


> +}
> +
> +static void bpf_tcp_unhash(struct sock *sk)
> +{
> + void (*unhash_fun)(struct sock *sk);
> + struct smap_psock *psock;
> +
> + rcu_read_lock();
> + psock = smap_psock_sk(sk);
> + if (unlikely(!psock)) {
> + rcu_read_unlock();
> + release_sock(sk);
 ^
> + return sk->sk_prot->unhash(sk);

if (sk->sk_prot->unhash) ...
else return;

Thanks,
John


Re: [PATCH rdma-next 00/24] Extend DEVX functionality

2018-09-17 Thread Or Gerlitz
On Mon, Sep 17, 2018 at 10:34 PM, Leon Romanovsky  wrote:
> On Mon, Sep 17, 2018 at 02:03:53PM +0300, Leon Romanovsky wrote:
>> From: Leon Romanovsky 
>>
>> From Yishai,
>>
>> This series comes to enable the DEVX functionality in some wider scope,
>> specifically,
>> - It enables using kernel objects that were created by the verbs
>>   API in the DEVX flow.
>> - It enables white list commands without DEVX user context.
>> - It enables the IB link layer under CAP_NET_RAW capabilities.
>> - It exposes the PRM handles for RAW QP (i.e. TIRN, TISN, RQN, SQN)
>>   to be used later on directly by the DEVX interface.
>>
>> In General,
>> Each object that is created/destroyed/modified via verbs will be stamped
>> with a UID based on its user context. This is already done for DEVX objects
>> commands.
>>
>> This will enable the firmware to enforce the usage of kernel objects
>> from the DEVX flow by validating that the same UID is used and the resources 
>> are
>> really related to the same user.
>>
>> For example in case a CQ was created with verbs it will be stamped with
>> UID and once will be pointed by a DEVX create QP command the firmware will
>> validate that the input CQN really belongs to the UID which issues the 
>> create QP
>> command.
>>
>> As of the above, all the PRM objects (except of the public ones which
>> are managed by the kernel e.g. FLOW, etc.) will have a UID upon their
>> create/modify/destroy commands. The detection of UMEM / physical
>> addressed in the relevant commands will be done by firmware according to a 
>> 'umem
>> valid bit' as the UID may be used in both cases.
>>
>> The series also enables white list commands which don't require a
>> specific DEVX context, instead of this a device UID is used so that
>> the firmware will mask un-privileged functionality. The IB link layer
>> is also enabled once CAP_NET_RAW permission exists.
>>
>> To enable using the RAW QP underlay objects (e.g. TIRN, RQN, etc.) later
>> on by DEVX commands the UHW output for this case was extended to return this
>> data when a DEVX context is used.
>>
>> Thanks
>>
>> Leon Romanovsky (1):
>>   net/mlx5: Update mlx5_ifc with DEVX UID bits
>>
>> Yishai Hadas (24):
>>   net/mlx5: Set uid as part of CQ commands
>>   net/mlx5: Set uid as part of QP commands
>>   net/mlx5: Set uid as part of RQ commands
>>   net/mlx5: Set uid as part of SQ commands
>>   net/mlx5: Set uid as part of SRQ commands
>>   net/mlx5: Set uid as part of DCT commands
>
> Hi Doug and Jason,
>
> Do you want me to resend 7 patches above in one series and other patches
> in another series just to be below 15 patches limit? Please be aware
> that those patches above are going to mlx5-next and not to
> net-next/rdma-next.
>
> No rebase, no code change, no much meaning too, but it is your call.

how about yes! for stop shitting on Dave Miller?


Re: [PATCH rdma-next 00/24] Extend DEVX functionality

2018-09-17 Thread Leon Romanovsky
On Mon, Sep 17, 2018 at 02:03:53PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky 
>
> From Yishai,
>
> This series comes to enable the DEVX functionality in some wider scope,
> specifically,
> - It enables using kernel objects that were created by the verbs
>   API in the DEVX flow.
> - It enables white list commands without DEVX user context.
> - It enables the IB link layer under CAP_NET_RAW capabilities.
> - It exposes the PRM handles for RAW QP (i.e. TIRN, TISN, RQN, SQN)
>   to be used later on directly by the DEVX interface.
>
> In General,
> Each object that is created/destroyed/modified via verbs will be stamped
> with a UID based on its user context. This is already done for DEVX objects
> commands.
>
> This will enable the firmware to enforce the usage of kernel objects
> from the DEVX flow by validating that the same UID is used and the resources 
> are
> really related to the same user.
>
> For example in case a CQ was created with verbs it will be stamped with
> UID and once will be pointed by a DEVX create QP command the firmware will
> validate that the input CQN really belongs to the UID which issues the create 
> QP
> command.
>
> As of the above, all the PRM objects (except of the public ones which
> are managed by the kernel e.g. FLOW, etc.) will have a UID upon their
> create/modify/destroy commands. The detection of UMEM / physical
> addressed in the relevant commands will be done by firmware according to a 
> 'umem
> valid bit' as the UID may be used in both cases.
>
> The series also enables white list commands which don't require a
> specific DEVX context, instead of this a device UID is used so that
> the firmware will mask un-privileged functionality. The IB link layer
> is also enabled once CAP_NET_RAW permission exists.
>
> To enable using the RAW QP underlay objects (e.g. TIRN, RQN, etc.) later
> on by DEVX commands the UHW output for this case was extended to return this
> data when a DEVX context is used.
>
> Thanks
>
> Leon Romanovsky (1):
>   net/mlx5: Update mlx5_ifc with DEVX UID bits
>
> Yishai Hadas (24):
>   net/mlx5: Set uid as part of CQ commands
>   net/mlx5: Set uid as part of QP commands
>   net/mlx5: Set uid as part of RQ commands
>   net/mlx5: Set uid as part of SQ commands
>   net/mlx5: Set uid as part of SRQ commands
>   net/mlx5: Set uid as part of DCT commands

Hi Doug and Jason,

Do you want me to resend 7 patches above in one series and other patches
in another series just to be below 15 patches limit? Please be aware
that those patches above are going to mlx5-next and not to
net-next/rdma-next.

No rebase, no code change, no much meaning too, but it is your call.

Thanks

>   IB/mlx5: Set uid as part of CQ creation
>   IB/mlx5: Set uid as part of QP creation
>   IB/mlx5: Set uid as part of RQ commands
>   IB/mlx5: Set uid as part of SQ commands
>   IB/mlx5: Set uid as part of TIR commands
>   IB/mlx5: Set uid as part of TIS commands
>   IB/mlx5: Set uid as part of RQT commands
>   IB/mlx5: Set uid as part of PD commands
>   IB/mlx5: Set uid as part of TD commands
>   IB/mlx5: Set uid as part of SRQ commands
>   IB/mlx5: Set uid as part of DCT commands
>   IB/mlx5: Set uid as part of XRCD commands
>   IB/mlx5: Set uid as part of MCG commands
>   IB/mlx5: Set valid umem bit on DEVX
>   IB/mlx5: Expose RAW QP device handles to user space
>   IB/mlx5: Manage device uid for DEVX white list commands
>   IB/mlx5: Enable DEVX white list commands
>   IB/mlx5: Enable DEVX on IB
>
>  drivers/infiniband/hw/mlx5/cmd.c  | 129 ++
>  drivers/infiniband/hw/mlx5/cmd.h  |  14 ++
>  drivers/infiniband/hw/mlx5/cq.c   |   1 +
>  drivers/infiniband/hw/mlx5/devx.c | 182 
> +++---
>  drivers/infiniband/hw/mlx5/main.c |  80 +++
>  drivers/infiniband/hw/mlx5/mlx5_ib.h  |  15 +--
>  drivers/infiniband/hw/mlx5/qp.c   | 141 +++-
>  drivers/infiniband/hw/mlx5/srq.c  |   1 +
>  drivers/net/ethernet/mellanox/mlx5/core/cq.c  |   4 +
>  drivers/net/ethernet/mellanox/mlx5/core/qp.c  |  81 
>  drivers/net/ethernet/mellanox/mlx5/core/srq.c |  30 -
>  include/linux/mlx5/cq.h   |   1 +
>  include/linux/mlx5/driver.h   |   1 +
>  include/linux/mlx5/mlx5_ifc.h | 135 +++
>  include/linux/mlx5/qp.h   |   1 +
>  include/linux/mlx5/srq.h  |   1 +
>  include/uapi/rdma/mlx5-abi.h  |  13 ++
>  17 files changed, 657 insertions(+), 173 deletions(-)
>
> --
> 2.14.4
>


signature.asc
Description: PGP signature


Re: [PATCH net] net/ipv4: defensive cipso option parsing

2018-09-17 Thread Nuernberger, Stefan
On Mon, 2018-09-17 at 12:35 -0400, Paul Moore wrote:
> On Mon, Sep 17, 2018 at 11:12 AM Stefan Nuernberger 
> wrote:
> > 
> > commit 40413955ee26 ("Cipso: cipso_v4_optptr enter infinite loop")
> > fixed
> > a possible infinite loop in the IP option parsing of CIPSO. The fix
> > assumes that ip_options_compile filtered out all zero length
> > options and
> > that no other one-byte options beside IPOPT_END and IPOPT_NOOP
> > exist.
> > While this assumption currently holds true, add explicit checks for
> > zero
> > length and invalid length options to be safe for the future. Even
> > though
> > ip_options_compile should have validated the options, the
> > introduction of
> > new one-byte options can still confuse this code without the
> > additional
> > checks.
> > 
> > Signed-off-by: Stefan Nuernberger 
> > Reviewed-by: David Woodhouse 
> > Reviewed-by: Simon Veith 
> > Cc: sta...@vger.kernel.org
> > ---
> >  net/ipv4/cipso_ipv4.c | 10 --
> >  1 file changed, 8 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
> > index 82178cc69c96..f291b57b8474 100644
> > --- a/net/ipv4/cipso_ipv4.c
> > +++ b/net/ipv4/cipso_ipv4.c
> > @@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct
> > cipso_v4_doi *doi_def,
> >   *
> >   * Description:
> >   * Parse the packet's IP header looking for a CIPSO
> > option.  Returns a pointer
> > - * to the start of the CIPSO option on success, NULL if one if not
> > found.
> > + * to the start of the CIPSO option on success, NULL if one is not
> > found.
> >   *
> >   */
> >  unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
> > @@ -1522,9 +1522,11 @@ unsigned char *cipso_v4_optptr(const struct
> > sk_buff *skb)
> > int optlen;
> > int taglen;
> > 
> > -   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen >
> > 0; ) {
> > +   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen >
> > 1; ) {
> > switch (optptr[0]) {
> > case IPOPT_CIPSO:
> > +   if (!optptr[1] || optptr[1] > optlen)
> > +   return NULL;
> > return optptr;
> > case IPOPT_END:
> > return NULL;
> > @@ -1534,6 +1536,10 @@ unsigned char *cipso_v4_optptr(const struct
> > sk_buff *skb)
> > default:
> > taglen = optptr[1];
> > }
> > +
> > +   if (!taglen || taglen > optlen)
> > +   break;
> I tend to think that you reach a point where you simply need to trust
> that the stack is doing the right thing and that by the time you hit
> a
> certain point you can safely assume that the packet is well formed,
> but I'm not going to fight about that here.
> 
> Regardless of the above, I don't like how you're doing the option
> length check twice in this code, that looks ugly to me, I think we
> can
> do better.  How about something like this:
> 
>   for (...) {
> switch(optptr[0]) {
> case IPOPT_END:
>   return NULL;
> case IPOPT_NOOP:
>   taglen = 1;
> default:
>   taglen = optptr[1];
> }
> if (taglen == 0 || taglen > optlen)
>   return NULL;
> if (optptr[0] == IPOPT_CIPSO)
>   return optptr;
> 
>   }
> 

You're right, that looks much better. I sent around a new patch.

> > 
> > optlen -= taglen;
> > optptr += taglen;
> > }


Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B


[bpf PATCH v2 3/3] bpf: test_maps, only support ESTABLISHED socks

2018-09-17 Thread John Fastabend
Ensure that sockets added to a sock{map|hash} that are not in the
ESTABLISHED state are rejected.

Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 tools/testing/selftests/bpf/test_maps.c |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index 6f54f84..0f2090f 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data)
/* Test update without programs */
for (i = 0; i < 6; i++) {
err = bpf_map_update_elem(fd, , [i], BPF_ANY);
-   if (err) {
+   if (i < 2 && !err) {
+   printf("Allowed update sockmap '%i:%i' not in 
ESTABLISHED\n",
+  i, sfd[i]);
+   goto out_sockmap;
+   } else if (i > 1 && err) {
printf("Failed noprog update sockmap '%i:%i'\n",
   i, sfd[i]);
goto out_sockmap;
@@ -741,7 +745,7 @@ static void test_sockmap(int tasks, void *data)
}
 
/* Test map update elem afterwards fd lives in fd and map_fd */
-   for (i = 0; i < 6; i++) {
+   for (i = 2; i < 6; i++) {
err = bpf_map_update_elem(map_fd_rx, , [i], BPF_ANY);
if (err) {
printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
@@ -845,7 +849,7 @@ static void test_sockmap(int tasks, void *data)
}
 
/* Delete the elems without programs */
-   for (i = 0; i < 6; i++) {
+   for (i = 2; i < 6; i++) {
err = bpf_map_delete_elem(fd, );
if (err) {
printf("Failed delete sockmap %i '%i:%i'\n",



[bpf PATCH v2 1/3] bpf: sockmap only allow ESTABLISHED sock state

2018-09-17 Thread John Fastabend
After this patch we only allow socks that are in ESTABLISHED state or
are being added via a sock_ops event that is transitioning into an
ESTABLISHED state. By allowing sock_ops events we allow users to
manage sockmaps directly from sock ops programs. The two supported
sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and
BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB.

Similar to TLS ULP this ensures sk_user_data is correct.

Reported-by: Eric Dumazet 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |   31 ++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 488ef96..1f97b55 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state.
+*/
if (skops.sk->sk_type != SOCK_STREAM ||
-   skops.sk->sk_protocol != IPPROTO_TCP) {
+   skops.sk->sk_protocol != IPPROTO_TCP ||
+   skops.sk->sk_state != TCP_ESTABLISHED) {
fput(socket->file);
return -EOPNOTSUPP;
}
@@ -2453,6 +2457,16 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state.
+*/
+   if (skops.sk->sk_type != SOCK_STREAM ||
+   skops.sk->sk_protocol != IPPROTO_TCP ||
+   skops.sk->sk_state != TCP_ESTABLISHED) {
+   fput(socket->file);
+   return -EOPNOTSUPP;
+   }
+
lock_sock(skops.sk);
preempt_disable();
rcu_read_lock();
@@ -2543,10 +2557,22 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
.map_check_btf = map_check_no_btf,
 };
 
+static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
+{
+   return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+  ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
   struct bpf_map *, map, void *, key, u64, flags)
 {
WARN_ON_ONCE(!rcu_read_lock_held());
+
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state. This checks that the sock ops triggering the update is
+* one indicating we are (or will be soon) in an ESTABLISHED state.
+*/
+   if (!bpf_is_valid_sock_op(bpf_sock))
+   return -EOPNOTSUPP;
return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
 }
 
@@ -2565,6 +2591,9 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
   struct bpf_map *, map, void *, key, u64, flags)
 {
WARN_ON_ONCE(!rcu_read_lock_held());
+
+   if (!bpf_is_valid_sock_op(bpf_sock))
+   return -EOPNOTSUPP;
return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
 }
 



[bpf PATCH v2 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread John Fastabend
It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
state via tcp_disconnect() without actually calling tcp_close which
would then call our bpf_tcp_close() callback. Because of this a user
could disconnect a socket then put it in a LISTEN state which would
break our assumptions about sockets always being ESTABLISHED state.

To resolve this rely on the unhash hook, which is called in the
disconnect case, to remove the sock from the sockmap.

Reported-by: Eric Dumazet 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |   71 +-
 1 file changed, 52 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 1f97b55..7deb362 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -132,6 +132,7 @@ struct smap_psock {
struct work_struct gc_work;
 
struct proto *sk_proto;
+   void (*save_unhash)(struct sock *sk);
void (*save_close)(struct sock *sk, long timeout);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
@@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
*msg, size_t len,
 static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
+static void bpf_tcp_unhash(struct sock *sk);
 static void bpf_tcp_close(struct sock *sk, long timeout);
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -184,6 +186,7 @@ static void build_protos(struct proto 
prot[SOCKMAP_NUM_CONFIGS],
 struct proto *base)
 {
prot[SOCKMAP_BASE]  = *base;
+   prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
prot[SOCKMAP_BASE].close= bpf_tcp_close;
prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
@@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk)
return -EBUSY;
}
 
+   psock->save_unhash = sk->sk_prot->unhash;
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
 
@@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct 
sock *sk,
return e;
 }
 
-static void bpf_tcp_close(struct sock *sk, long timeout)
+static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
 {
-   void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
-   struct smap_psock *psock;
struct sock *osk;
 
-   lock_sock(sk);
-   rcu_read_lock();
-   psock = smap_psock_sk(sk);
-   if (unlikely(!psock)) {
-   rcu_read_unlock();
-   release_sock(sk);
-   return sk->sk_prot->close(sk, timeout);
-   }
-
-   /* The psock may be destroyed anytime after exiting the RCU critial
-* section so by the time we use close_fun the psock may no longer
-* be valid. However, bpf_tcp_close is called with the sock lock
-* held so the close hook and sk are still valid.
-*/
-   close_fun = psock->save_close;
-
if (psock->cork) {
free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
@@ -379,6 +365,53 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(e);
e = psock_map_pop(sk, psock);
}
+}
+
+static void bpf_tcp_unhash(struct sock *sk)
+{
+   void (*unhash_fun)(struct sock *sk);
+   struct smap_psock *psock;
+
+   rcu_read_lock();
+   psock = smap_psock_sk(sk);
+   if (unlikely(!psock)) {
+   rcu_read_unlock();
+   release_sock(sk);
+   return sk->sk_prot->unhash(sk);
+   }
+
+   /* The psock may be destroyed anytime after exiting the RCU critial
+* section so by the time we use close_fun the psock may no longer
+* be valid. However, bpf_tcp_close is called with the sock lock
+* held so the close hook and sk are still valid.
+*/
+   unhash_fun = psock->save_unhash;
+   bpf_tcp_remove(sk, psock);
+   rcu_read_unlock();
+   unhash_fun(sk);
+}
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+   void (*close_fun)(struct sock *sk, long timeout);
+   struct smap_psock *psock;
+
+   lock_sock(sk);
+   rcu_read_lock();
+   psock = smap_psock_sk(sk);
+   if (unlikely(!psock)) {
+   rcu_read_unlock();
+   release_sock(sk);
+   return sk->sk_prot->close(sk, timeout);
+   }
+
+   /* The psock may be destroyed anytime after exiting the RCU critial
+* section so by the time we use 

[bpf PATCH v2 0/3] bpf, sockmap ESTABLISHED state only

2018-09-17 Thread John Fastabend
Eric noted that using the close callback is not sufficient
to catch all transitions from ESTABLISHED state to a LISTEN
state. So this series does two things. First, only allow
adding socks in ESTABLISHED state and, second, use the unhash callback
to catch tcp_disconnect() transitions.

v2: Added check for ESTABLISH state in hash update sockmap as well.

Thanks,
John

---

John Fastabend (3):
  bpf: sockmap only allow ESTABLISHED sock state
  bpf: sockmap, fix transition through disconnect without close
  bpf: test_maps, only support ESTABLISHED socks


 kernel/bpf/sockmap.c|  102 +--
 tools/testing/selftests/bpf/test_maps.c |   10 ++-
 2 files changed, 89 insertions(+), 23 deletions(-)

--
Signature


[PATCH v2 net] net/ipv4: defensive cipso option parsing

2018-09-17 Thread Stefan Nuernberger
commit 40413955ee26 ("Cipso: cipso_v4_optptr enter infinite loop") fixed
a possible infinite loop in the IP option parsing of CIPSO. The fix
assumes that ip_options_compile filtered out all zero length options and
that no other one-byte options besides IPOPT_END and IPOPT_NOOP exist.
While this assumption currently holds true, add explicit checks for zero
length and invalid length options to be safe for the future. Even though
ip_options_compile should have validated the options, the introduction of
new one-byte options can still confuse this code without the additional
checks.

Signed-off-by: Stefan Nuernberger 
Cc: David Woodhouse 
Cc: Simon Veith 
Cc: sta...@vger.kernel.org
---
 net/ipv4/cipso_ipv4.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 82178cc69c96..777fa3b7fb13 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct 
cipso_v4_doi *doi_def,
  *
  * Description:
  * Parse the packet's IP header looking for a CIPSO option.  Returns a pointer
- * to the start of the CIPSO option on success, NULL if one if not found.
+ * to the start of the CIPSO option on success, NULL if one is not found.
  *
  */
 unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
@@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
int optlen;
int taglen;
 
-   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
switch (optptr[0]) {
-   case IPOPT_CIPSO:
-   return optptr;
case IPOPT_END:
return NULL;
case IPOPT_NOOP:
@@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
default:
taglen = optptr[1];
}
+   if (!taglen || taglen > optlen)
+   return NULL;
+   if (optptr[0] == IPOPT_CIPSO)
+   return optptr;
+
optlen -= taglen;
optptr += taglen;
}
-- 
2.19.0



[bpf PATCH 3/3] bpf: test_maps, only support ESTABLISHED socks

2018-09-17 Thread John Fastabend
Ensure that sockets added to a sock{map|hash} that are not in the
ESTABLISHED state are rejected.

Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 tools/testing/selftests/bpf/test_maps.c |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index 6f54f84..0f2090f 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data)
/* Test update without programs */
for (i = 0; i < 6; i++) {
err = bpf_map_update_elem(fd, , [i], BPF_ANY);
-   if (err) {
+   if (i < 2 && !err) {
+   printf("Allowed update sockmap '%i:%i' not in 
ESTABLISHED\n",
+  i, sfd[i]);
+   goto out_sockmap;
+   } else if (i > 1 && err) {
printf("Failed noprog update sockmap '%i:%i'\n",
   i, sfd[i]);
goto out_sockmap;
@@ -741,7 +745,7 @@ static void test_sockmap(int tasks, void *data)
}
 
/* Test map update elem afterwards fd lives in fd and map_fd */
-   for (i = 0; i < 6; i++) {
+   for (i = 2; i < 6; i++) {
err = bpf_map_update_elem(map_fd_rx, , [i], BPF_ANY);
if (err) {
printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
@@ -845,7 +849,7 @@ static void test_sockmap(int tasks, void *data)
}
 
/* Delete the elems without programs */
-   for (i = 0; i < 6; i++) {
+   for (i = 2; i < 6; i++) {
err = bpf_map_delete_elem(fd, );
if (err) {
printf("Failed delete sockmap %i '%i:%i'\n",



[bpf PATCH 2/3] bpf: sockmap, fix transition through disconnect without close

2018-09-17 Thread John Fastabend
It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
state via tcp_disconnect() without actually calling tcp_close which
would then call our bpf_tcp_close() callback. Because of this a user
could disconnect a socket then put it in a LISTEN state which would
break our assumptions about sockets always being in the ESTABLISHED state.

To resolve this rely on the unhash hook, which is called in the
disconnect case, to remove the sock from the sockmap.

Reported-by: Eric Dumazet 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |   71 +-
 1 file changed, 52 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 998b7bd..f6ab7f3 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -132,6 +132,7 @@ struct smap_psock {
struct work_struct gc_work;
 
struct proto *sk_proto;
+   void (*save_unhash)(struct sock *sk);
void (*save_close)(struct sock *sk, long timeout);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
@@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
*msg, size_t len,
 static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
+static void bpf_tcp_unhash(struct sock *sk);
 static void bpf_tcp_close(struct sock *sk, long timeout);
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -184,6 +186,7 @@ static void build_protos(struct proto 
prot[SOCKMAP_NUM_CONFIGS],
 struct proto *base)
 {
prot[SOCKMAP_BASE]  = *base;
+   prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
prot[SOCKMAP_BASE].close= bpf_tcp_close;
prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
@@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk)
return -EBUSY;
}
 
+   psock->save_unhash = sk->sk_prot->unhash;
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
 
@@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct 
sock *sk,
return e;
 }
 
-static void bpf_tcp_close(struct sock *sk, long timeout)
+static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
 {
-   void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
-   struct smap_psock *psock;
struct sock *osk;
 
-   lock_sock(sk);
-   rcu_read_lock();
-   psock = smap_psock_sk(sk);
-   if (unlikely(!psock)) {
-   rcu_read_unlock();
-   release_sock(sk);
-   return sk->sk_prot->close(sk, timeout);
-   }
-
-   /* The psock may be destroyed anytime after exiting the RCU critial
-* section so by the time we use close_fun the psock may no longer
-* be valid. However, bpf_tcp_close is called with the sock lock
-* held so the close hook and sk are still valid.
-*/
-   close_fun = psock->save_close;
-
if (psock->cork) {
free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
@@ -379,6 +365,53 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(e);
e = psock_map_pop(sk, psock);
}
+}
+
+static void bpf_tcp_unhash(struct sock *sk)
+{
+   void (*unhash_fun)(struct sock *sk);
+   struct smap_psock *psock;
+
+   rcu_read_lock();
+   psock = smap_psock_sk(sk);
+   if (unlikely(!psock)) {
+   rcu_read_unlock();
+   release_sock(sk);
+   return sk->sk_prot->unhash(sk);
+   }
+
+   /* The psock may be destroyed anytime after exiting the RCU critial
+* section so by the time we use close_fun the psock may no longer
+* be valid. However, bpf_tcp_close is called with the sock lock
+* held so the close hook and sk are still valid.
+*/
+   unhash_fun = psock->save_unhash;
+   bpf_tcp_remove(sk, psock);
+   rcu_read_unlock();
+   unhash_fun(sk);
+}
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+   void (*close_fun)(struct sock *sk, long timeout);
+   struct smap_psock *psock;
+
+   lock_sock(sk);
+   rcu_read_lock();
+   psock = smap_psock_sk(sk);
+   if (unlikely(!psock)) {
+   rcu_read_unlock();
+   release_sock(sk);
+   return sk->sk_prot->close(sk, timeout);
+   }
+
+   /* The psock may be destroyed anytime after exiting the RCU critial
+* section so by the time we use 

[bpf PATCH 0/3] bpf, sockmap ESTABLISHED state only

2018-09-17 Thread John Fastabend
Eric noted that using the close callback is not sufficient
to catch all transitions from ESTABLISHED state to a LISTEN
state. So this series does two things. First, only allow
adding socks in ESTABLISHED state and, second, use the unhash callback
to catch tcp_disconnect() transitions.

Thanks,
John


---

John Fastabend (3):
  bpf: sockmap only allow ESTABLISHED sock state
  bpf: sockmap, fix transition through disconnect without close
  bpf: test_maps, only support ESTABLISHED socks


 kernel/bpf/sockmap.c|   92 ---
 tools/testing/selftests/bpf/test_maps.c |   10 ++-
 2 files changed, 79 insertions(+), 23 deletions(-)

--
Signature


[bpf PATCH 1/3] bpf: sockmap only allow ESTABLISHED sock state

2018-09-17 Thread John Fastabend
After this patch we only allow socks that are in ESTABLISHED state or
are being added via a sock_ops event that is transitioning into an
ESTABLISHED state. By allowing sock_ops events we allow users to
manage sockmaps directly from sock ops programs. The two supported
sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and
BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB.

Similar to TLS ULP this ensures sk_user_data is correct.

Reported-by: Eric Dumazet 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |   21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 488ef96..998b7bd 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state.
+*/
if (skops.sk->sk_type != SOCK_STREAM ||
-   skops.sk->sk_protocol != IPPROTO_TCP) {
+   skops.sk->sk_protocol != IPPROTO_TCP ||
+   skops.sk->sk_state != TCP_ESTABLISHED) {
fput(socket->file);
return -EOPNOTSUPP;
}
@@ -2543,10 +2547,22 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
.map_check_btf = map_check_no_btf,
 };
 
+static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
+{
+   return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+  ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
   struct bpf_map *, map, void *, key, u64, flags)
 {
WARN_ON_ONCE(!rcu_read_lock_held());
+
+   /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+* state. This checks that the sock ops triggering the update is
+* one indicating we are (or will be soon) in an ESTABLISHED state.
+*/
+   if (!bpf_is_valid_sock_op(bpf_sock))
+   return -EOPNOTSUPP;
return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
 }
 
@@ -2565,6 +2581,9 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
   struct bpf_map *, map, void *, key, u64, flags)
 {
WARN_ON_ONCE(!rcu_read_lock_held());
+
+   if (!bpf_is_valid_sock_op(bpf_sock))
+   return -EOPNOTSUPP;
return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
 }
 



Re: [Patch net v2] net/ipv6: do not copy dst flags on rt init

2018-09-17 Thread David Ahern
On 9/17/18 10:20 AM, Peter Oskolkov wrote:
> DST_NOCOUNT in dst_entry::flags tracks whether the entry counts
> toward route cache size (net->ipv6.sysctl.ip6_rt_max_size).
> 
> If the flag is NOT set, dst_ops::pcpuc_entries counter is incremented
> in dist_init() and decremented in dst_destroy().
> 
> This flag is tied to allocation/deallocation of dst_entry and
> should not be copied from another dst/route. Otherwise it can happen
> that dst_ops::pcpuc_entries counter grows until no new routes can
> be allocated because the counter reached ip6_rt_max_size due to
> DST_NOCOUNT not set and thus no counter decrements on gc-ed routes.
> 
> Fixes: 3b6761d18bc1 ("net/ipv6: Move dst flags to booleans in fib entries")
> Cc: David Ahern 
> Acked-by: Wei Wang 
> Signed-off-by: Peter Oskolkov 
> ---
>  net/ipv6/route.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 3eed045c65a5..480a79f47c52 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -946,8 +946,6 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, 
> struct fib6_info *ort)
>  
>  static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
>  {
> - rt->dst.flags |= fib6_info_dst_flags(ort);
> -
>   if (ort->fib6_flags & RTF_REJECT) {
>   ip6_rt_init_dst_reject(rt, ort);
>   return;
> 

Reviewed-by: David Ahern 

Thanks for the patch.


Re: [PATCH net] net/ipv6: do not copy DST_NOCOUNT flag on rt init

2018-09-17 Thread Peter Oskolkov
On Mon, Sep 17, 2018 at 9:59 AM David Ahern  wrote:
>
> On 9/17/18 9:11 AM, Peter Oskolkov wrote:
> > On Thu, Sep 13, 2018 at 9:11 PM David Ahern  wrote:
> >>
> >> On 9/13/18 1:38 PM, Peter Oskolkov wrote:
> >>
> >>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >>> index 3eed045c65a5..a3902f805305 100644
> >>> --- a/net/ipv6/route.c
> >>> +++ b/net/ipv6/route.c
> >>> @@ -946,7 +946,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info 
> >>> *rt, struct fib6_info *ort)
> >>>
> >>>  static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
> >>>  {
> >>> - rt->dst.flags |= fib6_info_dst_flags(ort);
> >>> + rt->dst.flags |= fib6_info_dst_flags(ort) & ~DST_NOCOUNT;
> >>
> >> I think my mistake is setting dst.flags in ip6_rt_init_dst. Flags
> >> argument is passed to ip6_dst_alloc which is always invoked before
> >> ip6_rt_copy_init is called which is the only caller of ip6_rt_init_dst.
> >
> > ip6_rt_cache_alloc calls ip6_dst_alloc with zero as flags; and only
> > one flag is copied later (DST_HOST) outside of ip6_rt_init_dst().
> > If the flag assignment is completely removed from ip6_rt_init_dst(),
> > then DST_NOPOLICY flag will be lost.
> >
> > Which may be OK, but is more than what this patch tries to solve (do not
> > copy DST_NOCOUNT flag).
>
> In the 4.17 kernel (prior to the fib6_info change), ip6_rt_cache_alloc
> calls __ip6_dst_alloc with 0 for flags so this is correct. The mistake
> is ip6_rt_copy_init -> ip6_rt_init_dst -> fib6_info_dst_flags.
>
> I believe the right fix is to drop the 'rt->dst.flags |=
> fib6_info_dst_flags(ort);' from ip6_rt_init_dst.

OK, I sent a v2 with the assignment removed. Thanks for the review!


[Patch net v2] net/ipv6: do not copy dst flags on rt init

2018-09-17 Thread Peter Oskolkov
DST_NOCOUNT in dst_entry::flags tracks whether the entry counts
toward route cache size (net->ipv6.sysctl.ip6_rt_max_size).

If the flag is NOT set, dst_ops::pcpuc_entries counter is incremented
in dst_init() and decremented in dst_destroy().

This flag is tied to allocation/deallocation of dst_entry and
should not be copied from another dst/route. Otherwise it can happen
that dst_ops::pcpuc_entries counter grows until no new routes can
be allocated because the counter reached ip6_rt_max_size due to
DST_NOCOUNT not set and thus no counter decrements on gc-ed routes.

Fixes: 3b6761d18bc1 ("net/ipv6: Move dst flags to booleans in fib entries")
Cc: David Ahern 
Acked-by: Wei Wang 
Signed-off-by: Peter Oskolkov 
---
 net/ipv6/route.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3eed045c65a5..480a79f47c52 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -946,8 +946,6 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, 
struct fib6_info *ort)
 
 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 {
-   rt->dst.flags |= fib6_info_dst_flags(ort);
-
if (ort->fib6_flags & RTF_REJECT) {
ip6_rt_init_dst_reject(rt, ort);
return;
-- 
2.19.0.397.gdd90340f6a-goog



iproute2: fail to add fdb entries to ipv6 vxlan device

2018-09-17 Thread Lorenzo Bianconi
Hi all,

while working on the IPv6 vxlan driver I figured out that with recent versions
of iproute2 it is no longer possible to configure an IPv6 vxlan device without
endpoint info (local ip, remote ip or group ip) and later add entries in
the vxlan fdb. This issue can be triggered with the following reproducer:

$ip -6 link add vxlan0 type vxlan id 42 dev enp0s2 \
proxy nolearning l2miss l3miss
$bridge fdb add 46:47:1f:a7:1c:25 dev vxlan1 dst 2000::2
RTNETLINK answers: Address family not supported by protocol

Starting from commit 1e9b8072de2c ("iplink_vxlan: Get rid of inet_get_addr()")
the preferred_family is no longer taken into account if neither saddr or daddr
are provided and vxlan kernel module will use IPv4 as default remote inet
family neglecting the one provided by userspace.
I guess we can fix that issue in two ways:

1- add a new netlink attribute to vxlan driver in order to specify the
preferred_family to use

2- restore the behaviour introduced in commit 97d564b90ccb ("vxlan: use
preferred address family when neither group or remote is specified") with
the following patch

-- >8 --
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 2bc253fc..831f39a2 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -82,6 +82,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
__u64 attrs = 0;
bool set_op = (n->nlmsg_type == RTM_NEWLINK &&
   !(n->nlmsg_flags & NLM_F_CREATE));
+   bool selected_family = false;
 
saddr.family = daddr.family = AF_UNSPEC;
 
@@ -356,12 +357,26 @@ static int vxlan_parse_opt(struct link_util *lu, int 
argc, char **argv,
int type = (saddr.family == AF_INET) ? IFLA_VXLAN_LOCAL
 : IFLA_VXLAN_LOCAL6;
addattr_l(n, 1024, type, saddr.data, saddr.bytelen);
+   selected_family = true;
}
 
if (is_addrtype_inet()) {
int type = (daddr.family == AF_INET) ? IFLA_VXLAN_GROUP
 : IFLA_VXLAN_GROUP6;
addattr_l(n, 1024, type, daddr.data, daddr.bytelen);
+   selected_family = true;
+   }
+
+   if (!selected_family) {
+   if (preferred_family == AF_INET) {
+   get_addr(, "default", AF_INET);
+   addattr_l(n, 1024, IFLA_VXLAN_GROUP,
+ daddr.data, daddr.bytelen);
+   } else if (preferred_family == AF_INET6) {
+   get_addr(, "default", AF_INET6);
+   addattr_l(n, 1024, IFLA_VXLAN_GROUP6,
+ daddr.data, daddr.bytelen);
+   }
}
 
if (!set_op || VXLAN_ATTRSET(attrs, IFLA_VXLAN_LEARNING))
-- 8< --

What is the best way to proceed?

Regards,
Lorenzo


Re: [PATCH net] net/ipv6: do not copy DST_NOCOUNT flag on rt init

2018-09-17 Thread David Ahern
On 9/17/18 9:11 AM, Peter Oskolkov wrote:
> On Thu, Sep 13, 2018 at 9:11 PM David Ahern  wrote:
>>
>> On 9/13/18 1:38 PM, Peter Oskolkov wrote:
>>
>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>>> index 3eed045c65a5..a3902f805305 100644
>>> --- a/net/ipv6/route.c
>>> +++ b/net/ipv6/route.c
>>> @@ -946,7 +946,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, 
>>> struct fib6_info *ort)
>>>
>>>  static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
>>>  {
>>> - rt->dst.flags |= fib6_info_dst_flags(ort);
>>> + rt->dst.flags |= fib6_info_dst_flags(ort) & ~DST_NOCOUNT;
>>
>> I think my mistake is setting dst.flags in ip6_rt_init_dst. Flags
>> argument is passed to ip6_dst_alloc which is always invoked before
>> ip6_rt_copy_init is called which is the only caller of ip6_rt_init_dst.
> 
> ip6_rt_cache_alloc calls ip6_dst_alloc with zero as flags; and only
> one flag is copied later (DST_HOST) outside of ip6_rt_init_dst().
> If the flag assignment is completely removed from ip6_rt_init_dst(),
> then DST_NOPOLICY flag will be lost.
> 
> Which may be OK, but is more than what this patch tries to solve (do not
> copy DST_NOCOUNT flag).

In the 4.17 kernel (prior to the fib6_info change), ip6_rt_cache_alloc
calls __ip6_dst_alloc with 0 for flags so this is correct. The mistake
is ip6_rt_copy_init -> ip6_rt_init_dst -> fib6_info_dst_flags.

I believe the right fix is to drop the 'rt->dst.flags |=
fib6_info_dst_flags(ort);' from ip6_rt_init_dst.


Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-17 Thread Florian Fainelli
On 09/17/2018 09:47 AM, Wang, Dongsheng wrote:
> On 9/17/2018 10:50 PM, Andrew Lunn wrote:
>> On Mon, Sep 17, 2018 at 04:53:29PM +0800, Wang Dongsheng wrote:
>>> This property copy from "ibm,emac.txt" to describe a shared MIDO bus.
>>> Since emac include MDIO, so If the motherboard has more than one PHY
>>> connected to an MDIO bus, this property will point to the MAC device
>>> that has the MDIO bus.
>>>
>>> Signed-off-by: Wang Dongsheng 
>>> ---
>>> V2: s/Since QDF2400 emac/Since emac/
>>> ---
>>>  Documentation/devicetree/bindings/net/qcom-emac.txt | 4 
>>>  1 file changed, 4 insertions(+)
>>>
>>> diff --git a/Documentation/devicetree/bindings/net/qcom-emac.txt 
>>> b/Documentation/devicetree/bindings/net/qcom-emac.txt
>>> index 346e6c7f47b7..50db71771358 100644
>>> --- a/Documentation/devicetree/bindings/net/qcom-emac.txt
>>> +++ b/Documentation/devicetree/bindings/net/qcom-emac.txt
>>> @@ -24,6 +24,9 @@ Internal PHY node:
>>>  The external phy child node:
>>>  - reg : The phy address
>>>  
>>> +Optional properties:
>>> +- mdio-device : Shared MIDO bus.
>> Hi Dongsheng
>>
>> I don't see why you need this property. The ethernet interface has a
>> phy-handle which points to a PHY. That is all you need to find the PHY.
> phy-handle is description PHY address. This property is describing an
> MDIO controller.
> Each QCOM emac include an MDIO controller, normally each emac only
> connect one
> PHY device, but when all of the PHY devices mdio lines connect one MDIO
> controller
> that is included in EMAC, we need to share this MDIO controller for
> others EMAC.

If you want to describe the MDIO controller, then you embed a mdio
subnode into your Ethernet MAC node:

 emac0: ethernet@feb2 {
mdio {
#address-cells = <1>;
#size-cells = <0>;

phy0: ethernet-phy@0 {
reg = <0>;
};
};
};

And then each Ethernet MAC controller refers to their appropriate PHY
device tree node using a phy-handle property to point to either their
own MDIO controller, or another MAC's MDIO controller.

The IBM Emac is a old (not to say bad) example and it does not use the
PHY library and the standard Device Tree node property, please don't use
it as a reference.
-- 
Florian


Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-17 Thread Wang, Dongsheng
On 9/17/2018 10:50 PM, Andrew Lunn wrote:
> On Mon, Sep 17, 2018 at 04:53:29PM +0800, Wang Dongsheng wrote:
>> This property copy from "ibm,emac.txt" to describe a shared MIDO bus.
>> Since emac include MDIO, so If the motherboard has more than one PHY
>> connected to an MDIO bus, this property will point to the MAC device
>> that has the MDIO bus.
>>
>> Signed-off-by: Wang Dongsheng 
>> ---
>> V2: s/Since QDF2400 emac/Since emac/
>> ---
>>  Documentation/devicetree/bindings/net/qcom-emac.txt | 4 
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/net/qcom-emac.txt 
>> b/Documentation/devicetree/bindings/net/qcom-emac.txt
>> index 346e6c7f47b7..50db71771358 100644
>> --- a/Documentation/devicetree/bindings/net/qcom-emac.txt
>> +++ b/Documentation/devicetree/bindings/net/qcom-emac.txt
>> @@ -24,6 +24,9 @@ Internal PHY node:
>>  The external phy child node:
>>  - reg : The phy address
>>  
>> +Optional properties:
>> +- mdio-device : Shared MIDO bus.
> Hi Dongsheng
>
> I don't see why you need this property. The ethernet interface has a
> phy-handle which points to a PHY. That is all you need to find the PHY.
phy-handle describes the PHY address. This property describes an
MDIO controller.
Each QCOM EMAC includes an MDIO controller; normally each EMAC only
connects to one
PHY device, but when the MDIO lines of all the PHY devices connect to one MDIO
controller
that is included in an EMAC, we need to share this MDIO controller with
the other EMACs.

Normally:

(MDIO)
MAC0 ---PHY0
 |   |
 |  (DATA) |
 -


(MDIO)
MAC1 ---PHY1
 |   |
 |  (DATA) |
 -



Shared MDIO bus: "mdio-device" = , MAC1 will get MAC0's MDIO bus
and also get the corresponding PHY device.

(DATA)
MAC0 ---PHY0
 |   |
 |  (MDIO) |
 
 |
MAC1PHY0
 |   |
 |  (DATA) |
 

Cheers,
Dongsheng


> emac0: ethernet@feb2 {
> compatible = "qcom,fsm9900-emac";
> reg = <0xfeb2 0x1>,
>   <0xfeb36000 0x1000>;
> interrupts = <76>;
>
> clocks = < 0>, < 1>, < 3>, < 4>, < 5>,
> < 6>, < 7>;
> clock-names = "axi_clk", "cfg_ahb_clk", "high_speed_clk",
> "mdio_clk", "tx_clk", "rx_clk", "sys_clk";
>
> internal-phy = <_sgmii>;
>
> phy-handle = <>;
>
> #address-cells = <1>;
> #size-cells = <0>;
>
> phy0: ethernet-phy@0 {
> reg = <0>;
> };
>
> phy1: ethernet-phy@1 {
> reg = <1>;
> };
>
> pinctrl-names = "default";
> pinctrl-0 = <_pins_a>;
> };
>
> emac1: ethernet@3890 {
> compatible = "qcom,fsm9900-emac";
>   ...
>   ...
>
> phy-handle = <>;
>   };
>
>   Andrew
>



Re: [PATCH net] net/ipv4: defensive cipso option parsing

2018-09-17 Thread Paul Moore
On Mon, Sep 17, 2018 at 11:12 AM Stefan Nuernberger  wrote:
> commit 40413955ee26 ("Cipso: cipso_v4_optptr enter infinite loop") fixed
> a possible infinite loop in the IP option parsing of CIPSO. The fix
> assumes that ip_options_compile filtered out all zero length options and
> that no other one-byte options beside IPOPT_END and IPOPT_NOOP exist.
> While this assumption currently holds true, add explicit checks for zero
> length and invalid length options to be safe for the future. Even though
> ip_options_compile should have validated the options, the introduction of
> new one-byte options can still confuse this code without the additional
> checks.
>
> Signed-off-by: Stefan Nuernberger 
> Reviewed-by: David Woodhouse 
> Reviewed-by: Simon Veith 
> Cc: sta...@vger.kernel.org
> ---
>  net/ipv4/cipso_ipv4.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
> index 82178cc69c96..f291b57b8474 100644
> --- a/net/ipv4/cipso_ipv4.c
> +++ b/net/ipv4/cipso_ipv4.c
> @@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct 
> cipso_v4_doi *doi_def,
>   *
>   * Description:
>   * Parse the packet's IP header looking for a CIPSO option.  Returns a 
> pointer
> - * to the start of the CIPSO option on success, NULL if one if not found.
> + * to the start of the CIPSO option on success, NULL if one is not found.
>   *
>   */
>  unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
> @@ -1522,9 +1522,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff 
> *skb)
> int optlen;
> int taglen;
>
> -   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
> +   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
> switch (optptr[0]) {
> case IPOPT_CIPSO:
> +   if (!optptr[1] || optptr[1] > optlen)
> +   return NULL;
> return optptr;
> case IPOPT_END:
> return NULL;
> @@ -1534,6 +1536,10 @@ unsigned char *cipso_v4_optptr(const struct sk_buff 
> *skb)
> default:
> taglen = optptr[1];
> }
> +
> +   if (!taglen || taglen > optlen)
> +   break;

I tend to think that you reach a point where you simply need to trust
that the stack is doing the right thing and that by the time you hit a
certain point you can safely assume that the packet is well formed,
but I'm not going to fight about that here.

Regardless of the above, I don't like how you're doing the option
length check twice in this code, that looks ugly to me, I think we can
do better.  How about something like this:

  for (...) {
switch(optptr[0]) {
case IPOPT_END:
  return NULL;
case IPOPT_NOOP:
  taglen = 1;
default:
  taglen = optptr[1];
}
if (taglen == 0 || taglen > optlen)
  return NULL;
if (optptr[0] == IPOPT_CIPSO)
  return optptr;

  }

> optlen -= taglen;
> optptr += taglen;
> }

-- 
paul moore
www.paul-moore.com


Re: [PATCH net] net/ipv6: do not copy DST_NOCOUNT flag on rt init

2018-09-17 Thread David Ahern
On 9/17/18 9:11 AM, Peter Oskolkov wrote:
> On Thu, Sep 13, 2018 at 9:11 PM David Ahern  wrote:
>>
>> On 9/13/18 1:38 PM, Peter Oskolkov wrote:
>>
>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>>> index 3eed045c65a5..a3902f805305 100644
>>> --- a/net/ipv6/route.c
>>> +++ b/net/ipv6/route.c
>>> @@ -946,7 +946,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, 
>>> struct fib6_info *ort)
>>>
>>>  static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
>>>  {
>>> - rt->dst.flags |= fib6_info_dst_flags(ort);
>>> + rt->dst.flags |= fib6_info_dst_flags(ort) & ~DST_NOCOUNT;
>>
>> I think my mistake is setting dst.flags in ip6_rt_init_dst. Flags
>> argument is passed to ip6_dst_alloc which is always invoked before
>> ip6_rt_copy_init is called which is the only caller of ip6_rt_init_dst.
> 
> ip6_rt_cache_alloc calls ip6_dst_alloc with zero as flags; and only
> one flag is copied later (DST_HOST) outside of ip6_rt_init_dst().
> If the flag assignment is completely removed from ip6_rt_init_dst(),
> then DST_NOPOLICY flag will be lost.
> 
> Which may be OK, but is more than what this patch tries to solve (do not
> copy DST_NOCOUNT flag).

After 5+ days mostly offline I just started looking at this problem.
Give me some time to chase down a thought I had from my last response.


Re: [PATCH net] net/ipv6: do not copy DST_NOCOUNT flag on rt init

2018-09-17 Thread Peter Oskolkov
On Thu, Sep 13, 2018 at 9:11 PM David Ahern  wrote:
>
> On 9/13/18 1:38 PM, Peter Oskolkov wrote:
>
> > diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> > index 3eed045c65a5..a3902f805305 100644
> > --- a/net/ipv6/route.c
> > +++ b/net/ipv6/route.c
> > @@ -946,7 +946,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, 
> > struct fib6_info *ort)
> >
> >  static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
> >  {
> > - rt->dst.flags |= fib6_info_dst_flags(ort);
> > + rt->dst.flags |= fib6_info_dst_flags(ort) & ~DST_NOCOUNT;
>
> I think my mistake is setting dst.flags in ip6_rt_init_dst. Flags
> argument is passed to ip6_dst_alloc which is always invoked before
> ip6_rt_copy_init is called which is the only caller of ip6_rt_init_dst.

ip6_rt_cache_alloc calls ip6_dst_alloc with zero as flags; and only
one flag is copied later (DST_HOST) outside of ip6_rt_init_dst().
If the flag assignment is completely removed from ip6_rt_init_dst(),
then DST_NOPOLICY flag will be lost.

Which may be OK, but is more than what this patch tries to solve (do not
copy DST_NOCOUNT flag).

>
> >
> >   if (ort->fib6_flags & RTF_REJECT) {
> >   ip6_rt_init_dst_reject(rt, ort);
> >
>


Re: [PATCH net-next 00/15] s390/qeth: updates 2018-09-17

2018-09-17 Thread David Miller
From: Julian Wiedmann 
Date: Mon, 17 Sep 2018 17:35:54 +0200

> please apply the following patchset to net-next. This brings more 
> restructuring
> of qeth's transmit code (eliminating its last usage of 
> skb_realloc_headroom()),
> and the usual mix of minor improvements & cleanups.

Series applied, thank you.


Re: What is the best forum (mailing list, irc etc) to ask questions about the usage of AF_XDP sockets.

2018-09-17 Thread Konrad Djimeli
On 2018-09-13 18:52, Jakub Kicinski wrote:
> On Thu, 13 Sep 2018 18:31:55 +0200, Konrad Djimeli wrote:
>> Hello,
>>
>> I have been working on trying to make use of AF_XDP sockets as part of a
>> project I working on, and I have been facing some issues but I am not
>> sure where to ask questions related to the usage of AF_XDP, since this
>> is a development mailing list.
> 
> IMHO AF_XDP is quite fresh so it should be okay to ask questions on
> netdev.  There is also xdp-newbies mailing list which seems very
> appropriate for less advanced questions!

Thanks a lot for the suggestions. Though I am still very new to the
technology, I really enjoy working with AF_XDP and I hope to share my
experience and queries with the community. Below is a link to what I
have been working on so far.
-
https://github.com/djkonro/snabb/commit/74310142882a5f09487c676f61f75b3bd686783c

Thanks
Konrad
www.djimeli.me


Re: [PATCH] net: phy: phylink: fix SFP interface autodetection

2018-09-17 Thread Baruch Siach
Hi Russell,

Russell King - ARM Linux writes:

> On Mon, Sep 17, 2018 at 05:19:57PM +0300, Baruch Siach wrote:
>> When the switching to the SFP detected link mode update the main
>> link_interface field as well. Otherwise, the link fails to come up when
>> the configured 'phy-mode' defers from the SFP detected mode.
>> 
>> This fixes 1GB SFP module link up on eth3 of the Macchiatobin board that
>> is configured in the DT to "2500base-x" phy-mode.
>
> link_interface isn't supposed to track the SFP link mode.  In any case,
> this is only used when a PHY is attached.  For a PHY on a SFP,
> phylink_connect_phy() should be using link_config.interface and not
> link_interface there.

You mean something like this?

@@ -750,7 +750,7 @@ int phylink_connect_phy(struct phylink *pl, struct 
phy_device *phy)
pl->link_config.interface = pl->link_interface;
}
 
-   ret = phy_attach_direct(pl->netdev, phy, 0, pl->link_interface);
+   ret = phy_attach_direct(pl->netdev, phy, 0, pl->link_config.interface);
if (ret)
return ret;
 
This fixes a similar issue on the Armada 8040 based Clearfog GT-8K where
the SFP interface phy-mode is set to "10gbase-kr". But the Macchiatobin
PHY-less interface has phy-mode set to "2500base-x", so I hit a separate
problem:

[   49.599455] WARNING: CPU: 1 PID: 1223 at drivers/net/phy/phylink.c:741 
phylink_connect_phy+0x34/0xb4
[   49.608629] CPU: 1 PID: 1223 Comm: ifconfig Not tainted 
4.19.0-rc4-6-g18cf7eb4b625-dirty #952
[   49.617537] Hardware name: Marvell 8040 MACCHIATOBin (DT)
[   49.622957] pstate: 6005 (nZCv daif -PAN -UAO)
[   49.627767] pc : phylink_connect_phy+0x34/0xb4
[   49.632229] lr : phylink_sfp_connect_phy+0xc/0x14
[   49.636950] sp : 0b273830
[   49.640276] x29: 0b273830 x28: 80007a342df0 
[   49.645610] x27: 04b0 x26: 0004 
[   49.650945] x25: 80007a342940 x24: 80007b90cc18 
[   49.656278] x23: 8000799b2538 x22: 08bda000 
[   49.661613] x21: 80007a342000 x20: 80007bca4000 
[   49.666947] x19: 80007b814a80 x18: 000a 
[   49.672282] x17:  x16: 0004 
[   49.677616] x15: 0416 x14: 0400 
[   49.682951] x13: 0400 x12:  
[   49.688284] x11:  x10:  
[   49.693618] x9 : 02c6 x8 :  
[   49.698952] x7 : 80007ff88b40 x6 : 0001 
[   49.704286] x5 : 0003 x4 : 80007bca4048 
[   49.709621] x3 : 0004 x2 : 0001 
[   49.714955] x1 : 80007bca4000 x0 : 80007a337400 
[   49.720289] Call trace:
[   49.722745]  phylink_connect_phy+0x34/0xb4
[   49.726857]  phylink_sfp_connect_phy+0xc/0x14
[   49.731231]  sfp_add_phy+0x48/0x50
[   49.734647]  sfp_sm_event+0x6e0/0x86c
[   49.738323]  sfp_start+0x10/0x18
[   49.741563]  sfp_upstream_start+0x28/0x3c
[   49.745588]  phylink_start+0x120/0x164
[   49.749352]  mvpp2_start_dev+0x184/0x260
[   49.753290]  mvpp2_open+0x394/0x3e8
[   49.756793]  __dev_open+0x110/0x124
[   49.760294]  __dev_change_flags+0xe8/0x190
[   49.764406]  dev_change_flags+0x20/0x5c
[   49.768258]  devinet_ioctl+0x270/0x528
[   49.772020]  inet_ioctl+0x13c/0x2e4
[   49.775524]  sock_do_ioctl+0x4c/0x1f4
[   49.779200]  sock_ioctl+0xf4/0x360
[   49.782616]  vfs_ioctl+0x24/0x40
[   49.785855]  do_vfs_ioctl+0xa4/0x7d4
[   49.789443]  ksys_ioctl+0x44/0x74
[   49.792770]  __arm64_sys_ioctl+0x18/0x24
[   49.796708]  el0_svc_common+0x8c/0xd0
[   49.800384]  el0_svc_handler+0x3c/0x6c
[   49.804148]  el0_svc+0x8/0xc
[   49.807039] ---[ end trace 0d1e0898561fce6e ]---
[   49.811729] sfp sfp-eth3: sfp_add_phy failed: -22

This hunk fixes that one:

@@ -738,7 +738,7 @@ int phylink_connect_phy(struct phylink *pl, struct 
phy_device *phy)
 
if (WARN_ON(pl->link_an_mode == MLO_AN_FIXED ||
(pl->link_an_mode == MLO_AN_INBAND &&
-phy_interface_mode_is_8023z(pl->link_interface))))
+phy_interface_mode_is_8023z(pl->link_config.interface))))
return -EINVAL;
 
if (pl->phydev)

Is that the correct fix?

Thanks,
baruch

-- 
 http://baruch.siach.name/blog/  ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.52.368.4656, http://www.tkos.co.il -


Re: [PATCH v3 net-next 07/12] net: ethernet: Add helper to remove a supported link mode

2018-09-17 Thread Andrew Lunn
On Mon, Sep 17, 2018 at 05:13:07PM +0200, Simon Horman wrote:
> On Wed, Sep 12, 2018 at 01:53:14AM +0200, Andrew Lunn wrote:
> > Some MAC hardware cannot support a subset of link modes. e.g. often
> > 1Gbps Full duplex is supported, but Half duplex is not. Add a helper
> > to remove such a link mode.
> > 
> > Signed-off-by: Andrew Lunn 
> > Reviewed-by: Florian Fainelli 
> > ---
> >  drivers/net/ethernet/apm/xgene/xgene_enet_hw.c |  6 +++---
> >  drivers/net/ethernet/cadence/macb_main.c   |  5 ++---
> >  drivers/net/ethernet/freescale/fec_main.c  |  3 ++-
> >  drivers/net/ethernet/microchip/lan743x_main.c  |  2 +-
> >  drivers/net/ethernet/renesas/ravb_main.c   |  3 ++-
> >  .../net/ethernet/stmicro/stmmac/stmmac_main.c  | 12 
> >  drivers/net/phy/phy_device.c   | 18 ++
> >  drivers/net/usb/lan78xx.c  |  2 +-
> >  include/linux/phy.h|  1 +
> >  9 files changed, 38 insertions(+), 14 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c 
> > b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
> > index 078a04dc1182..4831f9de5945 100644
> 
> ...
> 
> > diff --git a/drivers/net/ethernet/renesas/ravb_main.c 
> > b/drivers/net/ethernet/renesas/ravb_main.c
> > index aff5516b781e..fb2a1125780d 100644
> > --- a/drivers/net/ethernet/renesas/ravb_main.c
> > +++ b/drivers/net/ethernet/renesas/ravb_main.c
> > @@ -1074,7 +1074,8 @@ static int ravb_phy_init(struct net_device *ndev)
> > }
> >  
> > /* 10BASE is not supported */
> > -   phydev->supported &= ~PHY_10BT_FEATURES;
> > +   phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
> > +   phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Full_BIT);
> >  
> > phy_attached_info(phydev);
> >  
> 
> ...
> 
> > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > index db1172db1e7c..e9ca83a438b0 100644
> > --- a/drivers/net/phy/phy_device.c
> > +++ b/drivers/net/phy/phy_device.c
> > @@ -1765,6 +1765,24 @@ int phy_set_max_speed(struct phy_device *phydev, u32 
> > max_speed)
> >  }
> >  EXPORT_SYMBOL(phy_set_max_speed);
> >  
> > +/**
> > + * phy_remove_link_mode - Remove a supported link mode
> > + * @phydev: phy_device structure to remove link mode from
> > + * @link_mode: Link mode to be removed
> > + *
> > + * Description: Some MACs don't support all link modes which the PHY
> > + * does.  e.g. a 1G MAC often does not support 1000Half. Add a helper
> > + * to remove a link mode.
> > + */
> > +void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode)
> > +{
> > +   WARN_ON(link_mode > 31);
> > +
> > +   phydev->supported &= ~BIT(link_mode);
> > +   phydev->advertising = phydev->supported;
> > +}
> > +EXPORT_SYMBOL(phy_remove_link_mode);
> > +
> >  static void of_set_phy_supported(struct phy_device *phydev)
> >  {
> > struct device_node *node = phydev->mdio.dev.of_node;
> 
> Hi Andrew,
> 
> I believe that for the RAVB the overall effect of this change is that
> 10-BaseT modes are no longer advertised (although both with and without
> this patch they are not supported).
> 
> Unfortunately on R-Car Gen3 M3-W (r8a7796) based Salvator-X board
> I have observed that this results in the link no longer being negotiated
> on one switch (the one I usually use) while it seemed fine on another.

Hi Simon

Thanks for testing this.

Could you dump the PHY registers with and without this patch:

$ mii-tool -vv eth0

One difference is that phy_remove_link_mode() does
phydev->advertising = phydev->supported, whereas the old code does
not. I thought phylib would do this anyway; it does at some point in
time, but I didn't check when. It could be you are actually
advertising 10, even if you don't support it.

Thanks
Andrew


[PATCH net-next 07/15] s390/qeth: check size of required HW header cache object

2018-09-17 Thread Julian Wiedmann
When qeth_add_hw_header() falls back to the header cache, ensure that
the requested length doesn't exceed the object size.

For current usage this is a no-brainer, but TSO transmission will
introduce protocol headers of varying length.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core_main.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index eaf01dc62e91..79ebe8a5687b 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -3844,6 +3845,8 @@ int qeth_hdr_chk_and_bounce(struct sk_buff *skb, struct 
qeth_hdr **hdr, int len)
 }
 EXPORT_SYMBOL_GPL(qeth_hdr_chk_and_bounce);
 
+#define QETH_HDR_CACHE_OBJ_SIZE (sizeof(struct qeth_hdr) + 
ETH_HLEN)
+
 /**
  * qeth_add_hw_header() - add a HW header to an skb.
  * @skb: skb that the HW header should be added to.
@@ -3918,6 +3921,8 @@ int qeth_add_hw_header(struct qeth_card *card, struct 
sk_buff *skb,
return hdr_len;
}
/* fall back */
+   if (hdr_len + proto_len > QETH_HDR_CACHE_OBJ_SIZE)
+   return -E2BIG;
*hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
if (!*hdr)
return -ENOMEM;
@@ -6661,8 +6666,10 @@
rc = PTR_ERR_OR_ZERO(qeth_core_root_dev);
if (rc)
goto register_err;
-   qeth_core_header_cache = kmem_cache_create("qeth_hdr",
-   sizeof(struct qeth_hdr) + ETH_HLEN, 64, 0, NULL);
+   qeth_core_header_cache =
+   kmem_cache_create("qeth_hdr", QETH_HDR_CACHE_OBJ_SIZE,
+ roundup_pow_of_two(QETH_HDR_CACHE_OBJ_SIZE),
+ 0, NULL);
if (!qeth_core_header_cache) {
rc = -ENOMEM;
goto slab_err;
-- 
2.16.4



[PATCH net-next 01/15] s390/qeth: move L2 xmit code to core module

2018-09-17 Thread Julian Wiedmann
We need the exact same transmit path for non-offload-eligible traffic on
L3 OSAs. So make it accessible from both sub-drivers.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |  5 +++
 drivers/s390/net/qeth_core_main.c | 59 +++
 drivers/s390/net/qeth_l2_main.c   | 74 ++-
 3 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 34e0d476c5c6..2110fabdcc7a 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -1052,6 +1052,11 @@ int qeth_vm_request_mac(struct qeth_card *card);
 int qeth_add_hw_header(struct qeth_card *card, struct sk_buff *skb,
   struct qeth_hdr **hdr, unsigned int hdr_len,
   unsigned int proto_len, unsigned int *elements);
+int qeth_xmit(struct qeth_card *card, struct sk_buff *skb,
+ struct qeth_qdio_out_q *queue, int ipv, int cast_type,
+ void (*fill_header)(struct qeth_card *card, struct qeth_hdr *hdr,
+ struct sk_buff *skb, int ipv, int cast_type,
+ unsigned int data_len));
 
 /* exports for OSN */
 int qeth_osn_assist(struct net_device *, void *, int);
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index de8282420f96..d2ca33a9330a 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -4176,6 +4176,65 @@ int qeth_do_send_packet(struct qeth_card *card, struct 
qeth_qdio_out_q *queue,
 }
 EXPORT_SYMBOL_GPL(qeth_do_send_packet);
 
+int qeth_xmit(struct qeth_card *card, struct sk_buff *skb,
+ struct qeth_qdio_out_q *queue, int ipv, int cast_type,
+ void (*fill_header)(struct qeth_card *card, struct qeth_hdr *hdr,
+ struct sk_buff *skb, int ipv, int cast_type,
+ unsigned int data_len))
+{
+   const unsigned int proto_len = IS_IQD(card) ? ETH_HLEN : 0;
+   const unsigned int hw_hdr_len = sizeof(struct qeth_hdr);
+   unsigned int frame_len = skb->len;
+   unsigned int data_offset = 0;
+   struct qeth_hdr *hdr = NULL;
+   unsigned int hd_len = 0;
+   unsigned int elements;
+   int push_len, rc;
+   bool is_sg;
+
+   rc = skb_cow_head(skb, hw_hdr_len);
+   if (rc)
+   return rc;
+
+   push_len = qeth_add_hw_header(card, skb, &hdr, hw_hdr_len, proto_len,
+ &elements);
+   if (push_len < 0)
+   return push_len;
+   if (!push_len) {
+   /* HW header needs its own buffer element. */
+   hd_len = hw_hdr_len + proto_len;
+   data_offset = proto_len;
+   }
+   fill_header(card, hdr, skb, ipv, cast_type, frame_len);
+
+   is_sg = skb_is_nonlinear(skb);
+   if (IS_IQD(card)) {
+   rc = qeth_do_send_packet_fast(queue, skb, hdr, data_offset,
+ hd_len);
+   } else {
+   /* TODO: drop skb_orphan() once TX completion is fast enough */
+   skb_orphan(skb);
+   rc = qeth_do_send_packet(card, queue, skb, hdr, data_offset,
+hd_len, elements);
+   }
+
+   if (!rc) {
+   if (card->options.performance_stats) {
+   card->perf_stats.buf_elements_sent += elements;
+   if (is_sg)
+   card->perf_stats.sg_skbs_sent++;
+   }
+   } else {
+   if (!push_len)
+   kmem_cache_free(qeth_core_header_cache, hdr);
+   if (rc == -EBUSY)
+   /* roll back to ETH header */
+   skb_pull(skb, push_len);
+   }
+   return rc;
+}
+EXPORT_SYMBOL_GPL(qeth_xmit);
+
 static int qeth_setadp_promisc_mode_cb(struct qeth_card *card,
struct qeth_reply *reply, unsigned long data)
 {
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index b5e38531733f..715d58af5fc4 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -193,8 +193,9 @@ static int qeth_l2_get_cast_type(struct qeth_card *card, 
struct sk_buff *skb)
return RTN_UNICAST;
 }
 
-static void qeth_l2_fill_header(struct qeth_hdr *hdr, struct sk_buff *skb,
-   int cast_type, unsigned int data_len)
+static void qeth_l2_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
+   struct sk_buff *skb, int ipv, int cast_type,
+   unsigned int data_len)
 {
struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);
 
@@ -202,6 +203,12 @@ static void qeth_l2_fill_header(struct qeth_hdr *hdr, 
struct sk_buff *skb,
hdr->hdr.l2.id = QETH_HEADER_TYPE_LAYER2;

Re: [PATCH iproute2] libnetlink: fix leak and using unused memory on error

2018-09-17 Thread Stephen Hemminger
On Thu, 13 Sep 2018 12:33:38 -0700
Stephen Hemminger  wrote:

> If an error happens in multi-segment message (tc only)
> then report the error and stop processing further responses.
> This also fixes refering to the buffer after free.
> 
> The sequence check is not necessary here because the
> response message has already been validated to be in
> the window of the sequence number of the iov.
> 
> Reported-by: Mahesh Bandewar 
> Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic")
   ^
Bad habit of working on two projects at once.


[PATCH net-next 08/15] s390/qeth: prepare for copy-free TSO transmission

2018-09-17 Thread Julian Wiedmann
Add all the necessary TSO plumbing to the copy-less transmit path.
This includes calculating the right length of required protocol headers,
and always building a separate buffer element for the TSO headers.

A follow-up patch will then switch TSO traffic over to this path.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |  1 +
 drivers/s390/net/qeth_core_main.c | 10 -
 drivers/s390/net/qeth_l2_main.c   |  1 -
 drivers/s390/net/qeth_l3_main.c   | 93 ---
 4 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index b47fb95a49e9..d86eea9db2a7 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 79ebe8a5687b..460ffdf1b200 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3845,7 +3845,8 @@ int qeth_hdr_chk_and_bounce(struct sk_buff *skb, struct 
qeth_hdr **hdr, int len)
 }
 EXPORT_SYMBOL_GPL(qeth_hdr_chk_and_bounce);
 
-#define QETH_HDR_CACHE_OBJ_SIZE (sizeof(struct qeth_hdr) + 
ETH_HLEN)
+#define QETH_HDR_CACHE_OBJ_SIZE (sizeof(struct qeth_hdr_tso) + \
+MAX_TCP_HEADER)
 
 /**
  * qeth_add_hw_header() - add a HW header to an skb.
@@ -3880,7 +3881,11 @@ int qeth_add_hw_header(struct qeth_card *card, struct 
sk_buff *skb,
if (qeth_get_elements_for_range(start, end + contiguous) == 1) {
/* Push HW header into same page as first protocol header. */
push_ok = true;
-   __elements = qeth_count_elements(skb, 0);
+   /* ... but TSO always needs a separate element for headers: */
+   if (skb_is_gso(skb))
+   __elements = 1 + qeth_count_elements(skb, proto_len);
+   else
+   __elements = qeth_count_elements(skb, 0);
} else if (!proto_len && qeth_get_elements_for_range(start, end) == 1) {
/* Push HW header into a new page. */
push_ok = true;
@@ -4193,6 +4198,7 @@ int qeth_xmit(struct qeth_card *card, struct sk_buff *skb,
hd_len = hw_hdr_len + proto_len;
data_offset = proto_len;
}
+   memset(hdr, 0, hw_hdr_len);
fill_header(card, hdr, skb, ipv, cast_type, frame_len);
 
is_sg = skb_is_nonlinear(skb);
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 87cb71d5dae8..24b531ca2827 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -199,7 +199,6 @@ static void qeth_l2_fill_header(struct qeth_card *card, 
struct qeth_hdr *hdr,
 {
struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);
 
-   memset(hdr, 0, sizeof(struct qeth_hdr));
hdr->hdr.l2.id = QETH_HEADER_TYPE_LAYER2;
hdr->hdr.l2.pkt_length = data_len;
 
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 0c085f57f7e2..fe55218802d7 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2008,7 +2008,6 @@ static void qeth_l3_fill_af_iucv_hdr(struct qeth_hdr 
*hdr, struct sk_buff *skb,
char daddr[16];
struct af_iucv_trans_hdr *iucv_hdr;
 
-   memset(hdr, 0, sizeof(struct qeth_hdr));
hdr->hdr.l3.id = QETH_HEADER_TYPE_LAYER3;
hdr->hdr.l3.length = data_len;
hdr->hdr.l3.flags = QETH_HDR_IPV6 | QETH_CAST_UNICAST;
@@ -2038,10 +2037,22 @@ static void qeth_l3_fill_header(struct qeth_card *card, 
struct qeth_hdr *hdr,
 {
struct vlan_ethhdr *veth = vlan_eth_hdr(skb);
 
-   memset(hdr, 0, sizeof(struct qeth_hdr));
-   hdr->hdr.l3.id = QETH_HEADER_TYPE_LAYER3;
hdr->hdr.l3.length = data_len;
 
+   if (skb_is_gso(skb)) {
+   hdr->hdr.l3.id = QETH_HEADER_TYPE_TSO;
+   } else {
+   hdr->hdr.l3.id = QETH_HEADER_TYPE_LAYER3;
+   if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   qeth_tx_csum(skb, &hdr->hdr.l3.ext_flags, ipv);
+   /* some HW requires combined L3+L4 csum offload: */
+   if (ipv == 4)
+   hdr->hdr.l3.ext_flags |= 
QETH_HDR_EXT_CSUM_HDR_REQ;
+   if (card->options.performance_stats)
+   card->perf_stats.tx_csum++;
+   }
+   }
+
if (ipv == 4 || IS_IQD(card)) {
/* NETIF_F_HW_VLAN_CTAG_TX */
if (skb_vlan_tag_present(skb)) {
@@ -2053,15 +2064,6 @@ static void qeth_l3_fill_header(struct qeth_card *card, 
struct qeth_hdr *hdr,
hdr->hdr.l3.vlan_id = ntohs(veth->h_vlan_TCI);
}
 
-   if (!skb_is_gso(skb) && skb->ip_summed == CHECKSUM_PARTIAL) {

[PATCH net-next 13/15] s390/qeth: fix typo in return value

2018-09-17 Thread Julian Wiedmann
Assuming this was just a typo, as returning an actual negative value
from a cmd callback would make no sense either.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index b76e844811e1..f09bef4a49ca 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3058,7 +3058,7 @@ static int qeth_query_ipassists_cb(struct qeth_card *card,
QETH_DBF_TEXT(SETUP, 2, "ipaunsup");
card->options.ipa4.supported_funcs |= IPA_SETADAPTERPARMS;
card->options.ipa6.supported_funcs |= IPA_SETADAPTERPARMS;
-   return -0;
+   return 0;
default:
if (cmd->hdr.return_code) {
QETH_DBF_MESSAGE(1, "%s IPA_CMD_QIPASSIST: Unhandled "
-- 
2.16.4



[PATCH net-next 04/15] s390/qeth: remove qeth_get_elements_no()

2018-09-17 Thread Julian Wiedmann
Convert the last remaining user of qeth_get_elements_no() to
qeth_count_elements(), so this helper can be removed.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |  3 +--
 drivers/s390/net/qeth_core_main.c | 39 +++
 drivers/s390/net/qeth_l2_main.c   |  4 ++--
 drivers/s390/net/qeth_l3_main.c   |  1 -
 4 files changed, 14 insertions(+), 33 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 2110fabdcc7a..0857b1286660 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -1007,8 +1007,7 @@ int qeth_query_switch_attributes(struct qeth_card *card,
 int qeth_send_control_data(struct qeth_card *, int, struct qeth_cmd_buffer *,
int (*reply_cb)(struct qeth_card *, struct qeth_reply*, unsigned long),
void *reply_param);
-int qeth_get_elements_no(struct qeth_card *card, struct sk_buff *skb,
-int extra_elems, int data_offset);
+unsigned int qeth_count_elements(struct sk_buff *skb, unsigned int 
data_offset);
 int qeth_get_elements_for_frags(struct sk_buff *);
 int qeth_do_send_packet_fast(struct qeth_qdio_out_q *queue, struct sk_buff 
*skb,
 struct qeth_hdr *hdr, unsigned int offset,
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index d2ca33a9330a..eaf01dc62e91 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3802,7 +3802,16 @@ int qeth_get_elements_for_frags(struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(qeth_get_elements_for_frags);
 
-static unsigned int qeth_count_elements(struct sk_buff *skb, int data_offset)
+/**
+ * qeth_count_elements() - Counts the number of QDIO buffer elements needed
+ * to transmit an skb.
+ * @skb:   the skb to operate on.
+ * @data_offset:   skip this part of the skb's linear data
+ *
+ * Returns the number of pages, and thus QDIO buffer elements, needed to map 
the
+ * skb's data (both its linear part and paged fragments).
+ */
+unsigned int qeth_count_elements(struct sk_buff *skb, unsigned int data_offset)
 {
unsigned int elements = qeth_get_elements_for_frags(skb);
addr_t end = (addr_t)skb->data + skb_headlen(skb);
@@ -3812,33 +3821,7 @@ static unsigned int qeth_count_elements(struct sk_buff 
*skb, int data_offset)
elements += qeth_get_elements_for_range(start, end);
return elements;
 }
-
-/**
- * qeth_get_elements_no() -find number of SBALEs for skb data, inc. frags.
- * @card:  qeth card structure, to check max. elems.
- * @skb:   SKB address
- * @extra_elems:   extra elems needed, to check against max.
- * @data_offset:   range starts at skb->data + data_offset
- *
- * Returns the number of pages, and thus QDIO buffer elements, needed to cover
- * skb data, including linear part and fragments. Checks if the result plus
- * extra_elems fits under the limit for the card. Returns 0 if it does not.
- * Note: extra_elems is not included in the returned result.
- */
-int qeth_get_elements_no(struct qeth_card *card,
-struct sk_buff *skb, int extra_elems, int data_offset)
-{
-   int elements = qeth_count_elements(skb, data_offset);
-
-   if ((elements + extra_elems) > QETH_MAX_BUFFER_ELEMENTS(card)) {
-   QETH_DBF_MESSAGE(2, "Invalid size of IP packet "
-   "(Number=%d / Length=%d). Discarded.\n",
-   elements + extra_elems, skb->len);
-   return 0;
-   }
-   return elements;
-}
-EXPORT_SYMBOL_GPL(qeth_get_elements_no);
+EXPORT_SYMBOL_GPL(qeth_count_elements);
 
 int qeth_hdr_chk_and_bounce(struct sk_buff *skb, struct qeth_hdr **hdr, int 
len)
 {
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 715d58af5fc4..87cb71d5dae8 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -658,8 +658,8 @@ static int qeth_l2_xmit_osn(struct qeth_card *card, struct 
sk_buff *skb,
return -EPROTONOSUPPORT;
 
hdr = (struct qeth_hdr *)skb->data;
-   elements = qeth_get_elements_no(card, skb, 0, 0);
-   if (!elements)
+   elements = qeth_count_elements(skb, 0);
+   if (elements > QETH_MAX_BUFFER_ELEMENTS(card))
return -E2BIG;
if (qeth_hdr_chk_and_bounce(skb, &hdr, sizeof(*hdr)))
return -EINVAL;
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 00e6e7471f5d..1d92584e01b3 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2140,7 +2140,6 @@ static void qeth_tso_fill_header(struct qeth_card *card,
  *
  * Returns the number of pages, and thus QDIO buffer elements, needed to cover
  * skb data, including linear part and fragments, but excluding TCP header.
- * 

[PATCH net-next 12/15] s390/qeth: invoke softirqs after napi_schedule()

2018-09-17 Thread Julian Wiedmann
Calling napi_schedule() from process context does not ensure that the
NET_RX softirq is run in a timely fashion. So trigger it manually.

This is no big issue with current code. A call to ndo_open() is usually
followed by a ndo_set_rx_mode() call, and for qeth this contains a
spin_unlock_bh(). Except for OSN, where qeth_l2_set_rx_mode() bails out
early.
Nevertheless it's best to not depend on this behaviour, and just fix
the issue at its source like all other drivers do. For instance see
commit 83a0c6e58901 ("i40e: Invoke softirqs after napi_reschedule").

Fixes: a1c3ed4c9ca0 ("qeth: NAPI support for l2 and l3 discipline")
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l2_main.c | 3 +++
 drivers/s390/net/qeth_l3_main.c | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 33b65471a68a..6285af373bdf 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -755,7 +755,10 @@ static int __qeth_l2_open(struct net_device *dev)
 
if (qdio_stop_irq(card->data.ccwdev, 0) >= 0) {
napi_enable(&card->napi);
+   local_bh_disable();
napi_schedule(&card->napi);
+   /* kick-start the NAPI softirq: */
+   local_bh_enable();
} else
rc = -EIO;
return rc;
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 5d7e2921ab36..8930d2a9fcad 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2293,7 +2293,10 @@ static int __qeth_l3_open(struct net_device *dev)
 
if (qdio_stop_irq(card->data.ccwdev, 0) >= 0) {
napi_enable(&card->napi);
+   local_bh_disable();
napi_schedule(&card->napi);
+   /* kick-start the NAPI softirq: */
+   local_bh_enable();
} else
rc = -EIO;
return rc;
-- 
2.16.4



[PATCH net-next 00/15] s390/qeth: updates 2018-09-17

2018-09-17 Thread Julian Wiedmann
Hi Dave,

please apply the following patchset to net-next. This brings more restructuring
of qeth's transmit code (eliminating its last usage of skb_realloc_headroom()),
and the usual mix of minor improvements & cleanups.

Thanks,
Julian


Julian Wiedmann (15):
  s390/qeth: move L2 xmit code to core module
  s390/qeth: run non-offload L3 traffic over common xmit path
  s390/qeth: remove unused L3 xmit code
  s390/qeth: remove qeth_get_elements_no()
  s390/qeth: limit csum offload erratum to L3 devices
  s390/qeth: fix up protocol headers early
  s390/qeth: check size of required HW header cache object
  s390/qeth: prepare for copy-free TSO transmission
  s390/qeth: speed up TSO transmission
  s390/qeth: remove qeth_hdr_chk_and_bounce()
  s390/qeth: uninstall IRQ handler on device removal
  s390/qeth: invoke softirqs after napi_schedule()
  s390/qeth: fix typo in return value
  s390/qeth: fine-tune spinlocks
  s390/qeth: reduce 0-initializing when building IPA cmds

 drivers/s390/net/qeth_core.h  |  17 +-
 drivers/s390/net/qeth_core_main.c | 307 ++
 drivers/s390/net/qeth_l2_main.c   | 116 +
 drivers/s390/net/qeth_l3_main.c   | 340 +-
 4 files changed, 334 insertions(+), 446 deletions(-)

-- 
2.16.4



[PATCH net-next 11/15] s390/qeth: uninstall IRQ handler on device removal

2018-09-17 Thread Julian Wiedmann
When setting up, qeth installs its IRQ handler on the ccw devices. But
the IRQ handler is not cleared on removal - so even after qeth yields
control of the ccw devices, spurious interrupts would still be presented
to us.

Make (de-)installation of the IRQ handler part of the ccw channel
setup/removal helpers, and while at it also add the appropriate locking.
Shift around qeth_setup_channel() to avoid a forward declaration for
qeth_irq().

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core_main.c | 102 --
 1 file changed, 54 insertions(+), 48 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index c7f7061a7205..b76e844811e1 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -901,44 +901,6 @@ static void qeth_send_control_data_cb(struct qeth_channel 
*channel,
qeth_release_buffer(channel, iob);
 }
 
-static int qeth_setup_channel(struct qeth_channel *channel, bool alloc_buffers)
-{
-   int cnt;
-
-   QETH_DBF_TEXT(SETUP, 2, "setupch");
-
-   channel->ccw = kmalloc(sizeof(struct ccw1), GFP_KERNEL | GFP_DMA);
-   if (!channel->ccw)
-   return -ENOMEM;
-   channel->state = CH_STATE_DOWN;
-   atomic_set(&channel->irq_pending, 0);
-   init_waitqueue_head(&channel->wait_q);
-
-   if (!alloc_buffers)
-   return 0;
-
-   for (cnt = 0; cnt < QETH_CMD_BUFFER_NO; cnt++) {
-   channel->iob[cnt].data =
-   kzalloc(QETH_BUFSIZE, GFP_DMA|GFP_KERNEL);
-   if (channel->iob[cnt].data == NULL)
-   break;
-   channel->iob[cnt].state = BUF_STATE_FREE;
-   channel->iob[cnt].channel = channel;
-   channel->iob[cnt].callback = qeth_send_control_data_cb;
-   channel->iob[cnt].rc = 0;
-   }
-   if (cnt < QETH_CMD_BUFFER_NO) {
-   kfree(channel->ccw);
-   while (cnt-- > 0)
-   kfree(channel->iob[cnt].data);
-   return -ENOMEM;
-   }
-   channel->io_buf_no = 0;
-   spin_lock_init(&channel->iob_lock);
-
-   return 0;
-}
-
 static int qeth_set_thread_start_bit(struct qeth_card *card,
unsigned long thread)
 {
@@ -1337,14 +1299,61 @@ static void qeth_free_buffer_pool(struct qeth_card 
*card)
 
 static void qeth_clean_channel(struct qeth_channel *channel)
 {
+   struct ccw_device *cdev = channel->ccwdev;
int cnt;
 
QETH_DBF_TEXT(SETUP, 2, "freech");
+
+   spin_lock_irq(get_ccwdev_lock(cdev));
+   cdev->handler = NULL;
+   spin_unlock_irq(get_ccwdev_lock(cdev));
+
for (cnt = 0; cnt < QETH_CMD_BUFFER_NO; cnt++)
kfree(channel->iob[cnt].data);
kfree(channel->ccw);
 }
 
+static int qeth_setup_channel(struct qeth_channel *channel, bool alloc_buffers)
+{
+   struct ccw_device *cdev = channel->ccwdev;
+   int cnt;
+
+   QETH_DBF_TEXT(SETUP, 2, "setupch");
+
+   channel->ccw = kmalloc(sizeof(struct ccw1), GFP_KERNEL | GFP_DMA);
+   if (!channel->ccw)
+   return -ENOMEM;
+   channel->state = CH_STATE_DOWN;
+   atomic_set(&channel->irq_pending, 0);
+   init_waitqueue_head(&channel->wait_q);
+
+   spin_lock_irq(get_ccwdev_lock(cdev));
+   cdev->handler = qeth_irq;
+   spin_unlock_irq(get_ccwdev_lock(cdev));
+
+   if (!alloc_buffers)
+   return 0;
+
+   for (cnt = 0; cnt < QETH_CMD_BUFFER_NO; cnt++) {
+   channel->iob[cnt].data =
+   kzalloc(QETH_BUFSIZE, GFP_DMA|GFP_KERNEL);
+   if (channel->iob[cnt].data == NULL)
+   break;
+   channel->iob[cnt].state = BUF_STATE_FREE;
+   channel->iob[cnt].channel = channel;
+   channel->iob[cnt].callback = qeth_send_control_data_cb;
+   channel->iob[cnt].rc = 0;
+   }
+   if (cnt < QETH_CMD_BUFFER_NO) {
+   qeth_clean_channel(channel);
+   return -ENOMEM;
+   }
+   channel->io_buf_no = 0;
+   spin_lock_init(&channel->iob_lock);
+
+   return 0;
+}
+
 static void qeth_set_single_write_queues(struct qeth_card *card)
 {
if ((atomic_read(>qdio.state) != QETH_QDIO_UNINITIALIZED) &&
@@ -1495,7 +1504,7 @@ static void qeth_core_sl_print(struct seq_file *m, struct 
service_level *slr)
CARD_BUS_ID(card), card->info.mcl_level);
 }
 
-static struct qeth_card *qeth_alloc_card(void)
+static struct qeth_card *qeth_alloc_card(struct ccwgroup_device *gdev)
 {
struct qeth_card *card;
 
@@ -1504,6 +1513,11 @@ static struct qeth_card *qeth_alloc_card(void)
if (!card)
goto out;
QETH_DBF_HEX(SETUP, 2, &card, sizeof(void *));
+
+   card->gdev = gdev;
+   CARD_RDEV(card) = gdev->cdev[0];
+   CARD_WDEV(card) = gdev->cdev[1];
+   CARD_DDEV(card) = gdev->cdev[2];
if (qeth_setup_channel(&card->read, true))
 

[PATCH net-next 02/15] s390/qeth: run non-offload L3 traffic over common xmit path

2018-09-17 Thread Julian Wiedmann
L3 OSAs can only offload IPv4 traffic, use the common L2 transmit path
for all other traffic.
In particular there's no support for TX VLAN offload, so any such packet
needs to be manually de-accelerated via ndo_features_check().

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l3_main.c | 70 ++---
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index ada258c01a08..2733eb901b04 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1983,21 +1983,23 @@ static int qeth_l3_get_cast_type(struct sk_buff *skb)
rcu_read_unlock();
 
/* no neighbour (eg AF_PACKET), fall back to target's IP address ... */
-   if (be16_to_cpu(skb->protocol) == ETH_P_IPV6)
-   return ipv6_addr_is_multicast(_hdr(skb)->daddr) ?
-   RTN_MULTICAST : RTN_UNICAST;
-   else if (be16_to_cpu(skb->protocol) == ETH_P_IP)
+   switch (qeth_get_ip_version(skb)) {
+   case 4:
return ipv4_is_multicast(ip_hdr(skb)->daddr) ?
RTN_MULTICAST : RTN_UNICAST;
-
-   /* ... and MAC address */
-   if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, skb->dev->broadcast))
-   return RTN_BROADCAST;
-   if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
-   return RTN_MULTICAST;
-
-   /* default to unicast */
-   return RTN_UNICAST;
+   case 6:
+   return ipv6_addr_is_multicast(_hdr(skb)->daddr) ?
+   RTN_MULTICAST : RTN_UNICAST;
+   default:
+   /* ... and MAC address */
+   if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest,
+   skb->dev->broadcast))
+   return RTN_BROADCAST;
+   if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
+   return RTN_MULTICAST;
+   /* default to unicast */
+   return RTN_UNICAST;
+   }
 }
 
 static void qeth_l3_fill_af_iucv_hdr(struct qeth_hdr *hdr, struct sk_buff *skb,
@@ -2034,20 +2036,21 @@ static void qeth_l3_fill_header(struct qeth_card *card, 
struct qeth_hdr *hdr,
struct sk_buff *skb, int ipv, int cast_type,
unsigned int data_len)
 {
+   struct vlan_ethhdr *veth = vlan_eth_hdr(skb);
+
memset(hdr, 0, sizeof(struct qeth_hdr));
hdr->hdr.l3.id = QETH_HEADER_TYPE_LAYER3;
hdr->hdr.l3.length = data_len;
 
-   /*
-* before we're going to overwrite this location with next hop ip.
-* v6 uses passthrough, v4 sets the tag in the QDIO header.
-*/
-   if (skb_vlan_tag_present(skb)) {
-   if ((ipv == 4) || (card->info.type == QETH_CARD_TYPE_IQD))
-   hdr->hdr.l3.ext_flags = QETH_HDR_EXT_VLAN_FRAME;
-   else
-   hdr->hdr.l3.ext_flags = QETH_HDR_EXT_INCLUDE_VLAN_TAG;
-   hdr->hdr.l3.vlan_id = skb_vlan_tag_get(skb);
+   if (ipv == 4 || IS_IQD(card)) {
+   /* NETIF_F_HW_VLAN_CTAG_TX */
+   if (skb_vlan_tag_present(skb)) {
+   hdr->hdr.l3.ext_flags |= QETH_HDR_EXT_VLAN_FRAME;
+   hdr->hdr.l3.vlan_id = skb_vlan_tag_get(skb);
+   }
+   } else if (veth->h_vlan_proto == htons(ETH_P_8021Q)) {
+   hdr->hdr.l3.ext_flags |= QETH_HDR_EXT_INCLUDE_VLAN_TAG;
+   hdr->hdr.l3.vlan_id = ntohs(veth->h_vlan_TCI);
}
 
if (!skb_is_gso(skb) && skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -2373,8 +2376,11 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct 
sk_buff *skb,
 
if (IS_IQD(card) || (!skb_is_gso(skb) && ipv == 4))
rc = qeth_l3_xmit_offload(card, skb, queue, ipv, cast_type);
-   else
+   else if (skb_is_gso(skb))
rc = qeth_l3_xmit(card, skb, queue, ipv, cast_type);
+   else
+   rc = qeth_xmit(card, skb, queue, ipv, cast_type,
+  qeth_l3_fill_header);
 
if (!rc) {
card->stats.tx_packets++;
@@ -2476,6 +2482,15 @@ qeth_l3_neigh_setup(struct net_device *dev, struct 
neigh_parms *np)
return 0;
 }
 
+static netdev_features_t qeth_l3_osa_features_check(struct sk_buff *skb,
+   struct net_device *dev,
+   netdev_features_t features)
+{
+   if (qeth_get_ip_version(skb) != 4)
+   features &= ~NETIF_F_HW_VLAN_CTAG_TX;
+   return qeth_features_check(skb, dev, features);
+}
+
 static const struct net_device_ops qeth_l3_netdev_ops = {
.ndo_open   = qeth_l3_open,
.ndo_stop   = qeth_l3_stop,
@@ -2496,7 +2511,7 @@ static const struct net_device_ops qeth_l3_osa_netdev_ops 
= {

[PATCH net-next 03/15] s390/qeth: remove unused L3 xmit code

2018-09-17 Thread Julian Wiedmann
qeth_l3_xmit() is now only used for TSOv4 traffic, so shrink it down.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l3_main.c | 71 ++---
 1 file changed, 17 insertions(+), 54 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 2733eb901b04..00e6e7471f5d 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2230,44 +2230,24 @@ static int qeth_l3_xmit_offload(struct qeth_card *card, 
struct sk_buff *skb,
 static int qeth_l3_xmit(struct qeth_card *card, struct sk_buff *skb,
struct qeth_qdio_out_q *queue, int ipv, int cast_type)
 {
-   int elements, len, rc;
-   __be16 *tag;
struct qeth_hdr *hdr = NULL;
-   int hdr_elements = 0;
struct sk_buff *new_skb = NULL;
int tx_bytes = skb->len;
unsigned int hd_len;
-   bool use_tso, is_sg;
-
-   /* Ignore segment size from skb_is_gso(), 1 page is always used. */
-   use_tso = skb_is_gso(skb) &&
- (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4);
+   int elements, rc;
+   bool is_sg;
 
/* create a clone with writeable headroom */
-   new_skb = skb_realloc_headroom(skb, sizeof(struct qeth_hdr_tso) +
-   VLAN_HLEN);
+   new_skb = skb_realloc_headroom(skb, sizeof(struct qeth_hdr_tso));
if (!new_skb)
return -ENOMEM;
 
-   if (ipv == 4) {
-   skb_pull(new_skb, ETH_HLEN);
-   } else if (skb_vlan_tag_present(new_skb)) {
-   skb_push(new_skb, VLAN_HLEN);
-   skb_copy_to_linear_data(new_skb, new_skb->data + 4, 4);
-   skb_copy_to_linear_data_offset(new_skb, 4,
-  new_skb->data + 8, 4);
-   skb_copy_to_linear_data_offset(new_skb, 8,
-  new_skb->data + 12, 4);
-   tag = (__be16 *)(new_skb->data + 12);
-   *tag = cpu_to_be16(ETH_P_8021Q);
-   *(tag + 1) = cpu_to_be16(skb_vlan_tag_get(new_skb));
-   }
+   skb_pull(new_skb, ETH_HLEN);
 
/* fix hardware limitation: as long as we do not have sbal
 * chaining we can not send long frag lists
 */
-   if ((use_tso && !qeth_l3_get_elements_no_tso(card, new_skb, 1)) ||
-   (!use_tso && !qeth_get_elements_no(card, new_skb, 0, 0))) {
+   if (!qeth_l3_get_elements_no_tso(card, new_skb, 1)) {
rc = skb_linearize(new_skb);
 
if (card->options.performance_stats) {
@@ -2280,38 +2260,23 @@ static int qeth_l3_xmit(struct qeth_card *card, struct 
sk_buff *skb,
goto out;
}
 
-   if (use_tso) {
-   hdr = skb_push(new_skb, sizeof(struct qeth_hdr_tso));
-   memset(hdr, 0, sizeof(struct qeth_hdr_tso));
-   qeth_l3_fill_header(card, hdr, new_skb, ipv, cast_type,
-   new_skb->len - sizeof(struct qeth_hdr_tso));
-   qeth_tso_fill_header(card, hdr, new_skb);
-   hdr_elements++;
-   } else {
-   hdr = skb_push(new_skb, sizeof(struct qeth_hdr));
-   qeth_l3_fill_header(card, hdr, new_skb, ipv, cast_type,
-   new_skb->len - sizeof(struct qeth_hdr));
-   }
+   hdr = skb_push(new_skb, sizeof(struct qeth_hdr_tso));
+   memset(hdr, 0, sizeof(struct qeth_hdr_tso));
+   qeth_l3_fill_header(card, hdr, new_skb, ipv, cast_type,
+   new_skb->len - sizeof(struct qeth_hdr_tso));
+   qeth_tso_fill_header(card, hdr, new_skb);
 
-   elements = use_tso ?
-  qeth_l3_get_elements_no_tso(card, new_skb, hdr_elements) :
-  qeth_get_elements_no(card, new_skb, hdr_elements, 0);
+   elements = qeth_l3_get_elements_no_tso(card, new_skb, 1);
if (!elements) {
rc = -E2BIG;
goto out;
}
-   elements += hdr_elements;
+   elements++;
 
-   if (use_tso) {
-   hd_len = sizeof(struct qeth_hdr_tso) +
-ip_hdrlen(new_skb) + tcp_hdrlen(new_skb);
-   len = hd_len;
-   } else {
-   hd_len = 0;
-   len = sizeof(struct qeth_hdr_layer3);
-   }
+   hd_len = sizeof(struct qeth_hdr_tso) + ip_hdrlen(new_skb) +
+tcp_hdrlen(new_skb);
 
-   if (qeth_hdr_chk_and_bounce(new_skb, , len)) {
+   if (qeth_hdr_chk_and_bounce(new_skb, , hd_len)) {
rc = -EINVAL;
goto out;
}
@@ -2327,10 +2292,8 @@ static int qeth_l3_xmit(struct qeth_card *card, struct 
sk_buff *skb,
card->perf_stats.buf_elements_sent += elements;
if (is_sg)
card->perf_stats.sg_skbs_sent++;
-   if (use_tso) {
- 

[PATCH net-next 06/15] s390/qeth: fix up protocol headers early

2018-09-17 Thread Julian Wiedmann
When qeth_add_hw_header() falls back to the HW header cache, it also
copies over the necessary protocol headers. Thus any manipulation of
the protocol headers needs to happen before adding the HW header.

For current usage this doesn't matter, but it becomes relevant when
moving TSO transmission over to the faster code path.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l3_main.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index d4a967077279..0c085f57f7e2 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2056,10 +2056,8 @@ static void qeth_l3_fill_header(struct qeth_card *card, 
struct qeth_hdr *hdr,
if (!skb_is_gso(skb) && skb->ip_summed == CHECKSUM_PARTIAL) {
qeth_tx_csum(skb, >hdr.l3.ext_flags, ipv);
/* some HW requires combined L3+L4 csum offload: */
-   if (ipv == 4) {
+   if (ipv == 4)
hdr->hdr.l3.ext_flags |= QETH_HDR_EXT_CSUM_HDR_REQ;
-   ip_hdr(skb)->check = 0;
-   }
if (card->options.performance_stats)
card->perf_stats.tx_csum++;
}
@@ -2168,6 +2166,15 @@ static int qeth_l3_get_elements_no_tso(struct qeth_card 
*card,
return elements;
 }
 
+static void qeth_l3_fixup_headers(struct sk_buff *skb)
+{
+   struct iphdr *iph = ip_hdr(skb);
+
+   /* this is safe, IPv6 traffic takes a different path */
+   if (skb->ip_summed == CHECKSUM_PARTIAL)
+   iph->check = 0;
+}
+
 static int qeth_l3_xmit_offload(struct qeth_card *card, struct sk_buff *skb,
struct qeth_qdio_out_q *queue, int ipv,
int cast_type)
@@ -2188,6 +2195,7 @@ static int qeth_l3_xmit_offload(struct qeth_card *card, 
struct sk_buff *skb,
skb_pull(skb, ETH_HLEN);
frame_len = skb->len;
 
+   qeth_l3_fixup_headers(skb);
push_len = qeth_add_hw_header(card, skb, , hw_hdr_len, 0,
  );
if (push_len < 0)
-- 
2.16.4



[PATCH net-next 05/15] s390/qeth: limit csum offload erratum to L3 devices

2018-09-17 Thread Julian Wiedmann
Combined L3+L4 csum offload is only required for some L3 HW. So for
L2 devices, don't offload the IP header csum calculation.

Signed-off-by: Julian Wiedmann 
Reference-ID: JUP 394553
---
 drivers/s390/net/qeth_core.h| 5 -
 drivers/s390/net/qeth_l3_main.c | 5 +
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 0857b1286660..b47fb95a49e9 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -892,11 +892,6 @@ static inline void qeth_tx_csum(struct sk_buff *skb, u8 
*flags, int ipv)
if ((ipv == 4 && ip_hdr(skb)->protocol == IPPROTO_UDP) ||
(ipv == 6 && ipv6_hdr(skb)->nexthdr == IPPROTO_UDP))
*flags |= QETH_HDR_EXT_UDP;
-   if (ipv == 4) {
-   /* some HW requires combined L3+L4 csum offload: */
-   *flags |= QETH_HDR_EXT_CSUM_HDR_REQ;
-   ip_hdr(skb)->check = 0;
-   }
 }
 
 static inline void qeth_put_buffer_pool_entry(struct qeth_card *card,
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 1d92584e01b3..d4a967077279 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2055,6 +2055,11 @@ static void qeth_l3_fill_header(struct qeth_card *card, 
struct qeth_hdr *hdr,
 
if (!skb_is_gso(skb) && skb->ip_summed == CHECKSUM_PARTIAL) {
qeth_tx_csum(skb, >hdr.l3.ext_flags, ipv);
+   /* some HW requires combined L3+L4 csum offload: */
+   if (ipv == 4) {
+   hdr->hdr.l3.ext_flags |= QETH_HDR_EXT_CSUM_HDR_REQ;
+   ip_hdr(skb)->check = 0;
+   }
if (card->options.performance_stats)
card->perf_stats.tx_csum++;
}
-- 
2.16.4



[PATCH net-next 10/15] s390/qeth: remove qeth_hdr_chk_and_bounce()

2018-09-17 Thread Julian Wiedmann
Restructure the OSN xmit path to handle misaligned HW headers properly,
without shifting the packet data around.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |  1 -
 drivers/s390/net/qeth_core_main.c | 21 -
 drivers/s390/net/qeth_l2_main.c   | 37 -
 3 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 1c9fce609eb9..be213b5c2552 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -1021,7 +1021,6 @@ void qeth_dbf_longtext(debug_info_t *id, int level, char 
*text, ...);
 int qeth_core_ethtool_get_link_ksettings(struct net_device *netdev,
 struct ethtool_link_ksettings *cmd);
 int qeth_set_access_ctrl_online(struct qeth_card *card, int fallback);
-int qeth_hdr_chk_and_bounce(struct sk_buff *, struct qeth_hdr **, int);
 int qeth_configure_cq(struct qeth_card *, enum qeth_cq);
 int qeth_hw_trap(struct qeth_card *, enum qeth_diags_trap_action);
 void qeth_trace_features(struct qeth_card *);
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 7426167eace2..c7f7061a7205 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3823,27 +3823,6 @@ unsigned int qeth_count_elements(struct sk_buff *skb, 
unsigned int data_offset)
 }
 EXPORT_SYMBOL_GPL(qeth_count_elements);
 
-int qeth_hdr_chk_and_bounce(struct sk_buff *skb, struct qeth_hdr **hdr, int 
len)
-{
-   int hroom, inpage, rest;
-
-   if (((unsigned long)skb->data & PAGE_MASK) !=
-   (((unsigned long)skb->data + len - 1) & PAGE_MASK)) {
-   hroom = skb_headroom(skb);
-   inpage = PAGE_SIZE - ((unsigned long) skb->data % PAGE_SIZE);
-   rest = len - inpage;
-   if (rest > hroom)
-   return 1;
-   memmove(skb->data - rest, skb->data, skb_headlen(skb));
-   skb->data -= rest;
-   skb->tail -= rest;
-   *hdr = (struct qeth_hdr *)skb->data;
-   QETH_DBF_MESSAGE(2, "skb bounce len: %d rest: %d\n", len, rest);
-   }
-   return 0;
-}
-EXPORT_SYMBOL_GPL(qeth_hdr_chk_and_bounce);
-
 #define QETH_HDR_CACHE_OBJ_SIZE(sizeof(struct qeth_hdr_tso) + \
 MAX_TCP_HEADER)
 
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 24b531ca2827..33b65471a68a 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -650,19 +650,38 @@ static void qeth_l2_set_rx_mode(struct net_device *dev)
 static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb,
struct qeth_qdio_out_q *queue)
 {
-   unsigned int elements;
-   struct qeth_hdr *hdr;
+   struct qeth_hdr *hdr = (struct qeth_hdr *)skb->data;
+   addr_t end = (addr_t)(skb->data + sizeof(*hdr));
+   addr_t start = (addr_t)skb->data;
+   unsigned int elements = 0;
+   unsigned int hd_len = 0;
+   int rc;
 
if (skb->protocol == htons(ETH_P_IPV6))
return -EPROTONOSUPPORT;
 
-   hdr = (struct qeth_hdr *)skb->data;
-   elements = qeth_count_elements(skb, 0);
-   if (elements > QETH_MAX_BUFFER_ELEMENTS(card))
-   return -E2BIG;
-   if (qeth_hdr_chk_and_bounce(skb, , sizeof(*hdr)))
-   return -EINVAL;
-   return qeth_do_send_packet(card, queue, skb, hdr, 0, 0, elements);
+   if (qeth_get_elements_for_range(start, end) > 1) {
+   /* Misaligned HW header, move it to its own buffer element. */
+   hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
+   if (!hdr)
+   return -ENOMEM;
+   hd_len = sizeof(*hdr);
+   skb_copy_from_linear_data(skb, (char *)hdr, hd_len);
+   elements++;
+   }
+
+   elements += qeth_count_elements(skb, hd_len);
+   if (elements > QETH_MAX_BUFFER_ELEMENTS(card)) {
+   rc = -E2BIG;
+   goto out;
+   }
+
+   rc = qeth_do_send_packet(card, queue, skb, hdr, hd_len, hd_len,
+elements);
+out:
+   if (rc && hd_len)
+   kmem_cache_free(qeth_core_header_cache, hdr);
+   return rc;
 }
 
 static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
-- 
2.16.4



[PATCH net-next 09/15] s390/qeth: speed up TSO transmission

2018-09-17 Thread Julian Wiedmann
Switch TSO over to the faster transmit path, and remove all the unused
old TSO code.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |   1 -
 drivers/s390/net/qeth_core_main.c |   3 +-
 drivers/s390/net/qeth_l3_main.c   | 151 ++
 3 files changed, 6 insertions(+), 149 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index d86eea9db2a7..1c9fce609eb9 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -1004,7 +1004,6 @@ int qeth_send_control_data(struct qeth_card *, int, 
struct qeth_cmd_buffer *,
int (*reply_cb)(struct qeth_card *, struct qeth_reply*, unsigned long),
void *reply_param);
 unsigned int qeth_count_elements(struct sk_buff *skb, unsigned int 
data_offset);
-int qeth_get_elements_for_frags(struct sk_buff *);
 int qeth_do_send_packet_fast(struct qeth_qdio_out_q *queue, struct sk_buff 
*skb,
 struct qeth_hdr *hdr, unsigned int offset,
 unsigned int hd_len);
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 460ffdf1b200..7426167eace2 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3788,7 +3788,7 @@ EXPORT_SYMBOL_GPL(qeth_get_priority_queue);
  * Returns the number of pages, and thus QDIO buffer elements, needed to cover
  * fragmented part of the SKB. Returns zero for linear SKB.
  */
-int qeth_get_elements_for_frags(struct sk_buff *skb)
+static int qeth_get_elements_for_frags(struct sk_buff *skb)
 {
int cnt, elements = 0;
 
@@ -3801,7 +3801,6 @@ int qeth_get_elements_for_frags(struct sk_buff *skb)
}
return elements;
 }
-EXPORT_SYMBOL_GPL(qeth_get_elements_for_frags);
 
 /**
  * qeth_count_elements() - Counts the number of QDIO buffer elements needed
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index fe55218802d7..5d7e2921ab36 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -33,7 +33,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -2118,70 +2117,6 @@ static void qeth_l3_fill_tso_ext(struct qeth_hdr_tso 
*hdr,
ext->dg_hdr_len = proto_len;
 }
 
-static void qeth_tso_fill_header(struct qeth_card *card,
-   struct qeth_hdr *qhdr, struct sk_buff *skb)
-{
-   struct qeth_hdr_tso *hdr = (struct qeth_hdr_tso *)qhdr;
-   struct tcphdr *tcph = tcp_hdr(skb);
-   struct iphdr *iph = ip_hdr(skb);
-   struct ipv6hdr *ip6h = ipv6_hdr(skb);
-
-   /*set values which are fix for the first approach ...*/
-   hdr->ext.hdr_tot_len = (__u16) sizeof(struct qeth_hdr_ext_tso);
-   hdr->ext.imb_hdr_no  = 1;
-   hdr->ext.hdr_type= 1;
-   hdr->ext.hdr_version = 1;
-   hdr->ext.hdr_len = 28;
-   /*insert non-fix values */
-   hdr->ext.mss = skb_shinfo(skb)->gso_size;
-   hdr->ext.dg_hdr_len = (__u16)(ip_hdrlen(skb) + tcp_hdrlen(skb));
-   hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len -
-  sizeof(struct qeth_hdr_tso));
-   tcph->check = 0;
-   if (be16_to_cpu(skb->protocol) == ETH_P_IPV6) {
-   ip6h->payload_len = 0;
-   tcph->check = ~csum_ipv6_magic(>saddr, >daddr,
-  0, IPPROTO_TCP, 0);
-   } else {
-   /*OSA want us to set these values ...*/
-   tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-0, IPPROTO_TCP, 0);
-   iph->tot_len = 0;
-   iph->check = 0;
-   }
-}
-
-/**
- * qeth_l3_get_elements_no_tso() - find number of SBALEs for skb data for tso
- * @card: qeth card structure, to check max. elems.
- * @skb:  SKB address
- * @extra_elems:  extra elems needed, to check against max.
- *
- * Returns the number of pages, and thus QDIO buffer elements, needed to cover
- * skb data, including linear part and fragments, but excluding TCP header.
- * Checks if the result plus extra_elems fits under the limit for the card.
- * Returns 0 if it does not.
- * Note: extra_elems is not included in the returned result.
- */
-static int qeth_l3_get_elements_no_tso(struct qeth_card *card,
-   struct sk_buff *skb, int extra_elems)
-{
-   addr_t start = (addr_t)tcp_hdr(skb) + tcp_hdrlen(skb);
-   addr_t end = (addr_t)skb->data + skb_headlen(skb);
-   int elements = qeth_get_elements_for_frags(skb);
-
-   if (start != end)
-   elements += qeth_get_elements_for_range(start, end);
-
-   if ((elements + extra_elems) > QETH_MAX_BUFFER_ELEMENTS(card)) {
-   QETH_DBF_MESSAGE(2,
-   "Invalid size of TSO IP packet (Number=%d / Length=%d). Discarded.\n",
-   elements + extra_elems, 

[PATCH net-next 14/15] s390/qeth: fine-tune spinlocks

2018-09-17 Thread Julian Wiedmann
For quite a lot of code paths it's obvious that they will never run in
IRQ context. So replace their spin_lock_irqsave() calls with
spin_lock_irq().

While at it, get rid of the redundant card pointer in struct qeth_reply
that was used by qeth_send_control_data() to access the card's lock.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |  1 -
 drivers/s390/net/qeth_core_main.c | 53 ---
 drivers/s390/net/qeth_l2_main.c   |  5 ++--
 3 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index be213b5c2552..0dbe81f958f0 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -639,7 +639,6 @@ struct qeth_reply {
atomic_t received;
int rc;
void *param;
-   struct qeth_card *card;
refcount_t refcnt;
 };
 
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index f09bef4a49ca..100b6f4a3fb8 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -592,7 +592,6 @@ static struct qeth_reply *qeth_alloc_reply(struct qeth_card 
*card)
if (reply) {
refcount_set(>refcnt, 1);
atomic_set(>received, 0);
-   reply->card = card;
}
return reply;
 }
@@ -1541,15 +1540,14 @@ static struct qeth_card *qeth_alloc_card(struct 
ccwgroup_device *gdev)
 
 static int qeth_clear_channel(struct qeth_channel *channel)
 {
-   unsigned long flags;
struct qeth_card *card;
int rc;
 
card = CARD_FROM_CDEV(channel->ccwdev);
QETH_CARD_TEXT(card, 3, "clearch");
-   spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_lock_irq(get_ccwdev_lock(channel->ccwdev));
rc = ccw_device_clear(channel->ccwdev, QETH_CLEAR_CHANNEL_PARM);
-   spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_unlock_irq(get_ccwdev_lock(channel->ccwdev));
 
if (rc)
return rc;
@@ -1565,15 +1563,14 @@ static int qeth_clear_channel(struct qeth_channel 
*channel)
 
 static int qeth_halt_channel(struct qeth_channel *channel)
 {
-   unsigned long flags;
struct qeth_card *card;
int rc;
 
card = CARD_FROM_CDEV(channel->ccwdev);
QETH_CARD_TEXT(card, 3, "haltch");
-   spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_lock_irq(get_ccwdev_lock(channel->ccwdev));
rc = ccw_device_halt(channel->ccwdev, QETH_HALT_CHANNEL_PARM);
-   spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_unlock_irq(get_ccwdev_lock(channel->ccwdev));
 
if (rc)
return rc;
@@ -1667,7 +1664,6 @@ static int qeth_read_conf_data(struct qeth_card *card, 
void **buffer,
char *rcd_buf;
int ret;
struct qeth_channel *channel = >data;
-   unsigned long flags;
 
/*
 * scan for RCD command in extended SenseID data
@@ -1681,11 +1677,11 @@ static int qeth_read_conf_data(struct qeth_card *card, 
void **buffer,
 
qeth_setup_ccw(channel->ccw, ciw->cmd, ciw->count, rcd_buf);
channel->state = CH_STATE_RCD;
-   spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_lock_irq(get_ccwdev_lock(channel->ccwdev));
ret = ccw_device_start_timeout(channel->ccwdev, channel->ccw,
   QETH_RCD_PARM, LPM_ANYPATH, 0,
   QETH_RCD_TIMEOUT);
-   spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_unlock_irq(get_ccwdev_lock(channel->ccwdev));
if (!ret)
wait_event(card->wait_q,
   (channel->state == CH_STATE_RCD_DONE ||
@@ -1843,7 +1839,6 @@ static int qeth_idx_activate_get_answer(struct 
qeth_channel *channel,
struct qeth_cmd_buffer *))
 {
struct qeth_cmd_buffer *iob;
-   unsigned long flags;
int rc;
struct qeth_card *card;
 
@@ -1858,10 +1853,10 @@ static int qeth_idx_activate_get_answer(struct 
qeth_channel *channel,
wait_event(card->wait_q,
   atomic_cmpxchg(>irq_pending, 0, 1) == 0);
QETH_DBF_TEXT(SETUP, 6, "noirqpnd");
-   spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_lock_irq(get_ccwdev_lock(channel->ccwdev));
rc = ccw_device_start_timeout(channel->ccwdev, channel->ccw,
  (addr_t) iob, 0, 0, QETH_TIMEOUT);
-   spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+   spin_unlock_irq(get_ccwdev_lock(channel->ccwdev));
 
if (rc) {
QETH_DBF_MESSAGE(2, "Error2 in activating channel rc=%d\n", rc);
@@ -1888,7 +1883,6 @@ static int qeth_idx_activate_channel(struct qeth_channel 
*channel,
 {
struct qeth_card *card;
struct qeth_cmd_buffer *iob;
-  

[PATCH net-next 15/15] s390/qeth: reduce 0-initializing when building IPA cmds

2018-09-17 Thread Julian Wiedmann
qeth_get_ipacmd_buffer() obtains its buffers for building IPA cmds from
__qeth_get_buffer(), where they are fully cleared. So get rid of all the
additional zero-ing in various other places.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core_main.c | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 100b6f4a3fb8..89e09e7b8fff 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -780,7 +780,6 @@ void qeth_release_buffer(struct qeth_channel *channel,
 
QETH_CARD_TEXT(CARD_FROM_CDEV(channel->ccwdev), 6, "relbuff");
spin_lock_irqsave(>iob_lock, flags);
-   memset(iob->data, 0, QETH_BUFSIZE);
iob->state = BUF_STATE_FREE;
iob->callback = qeth_send_control_data_cb;
iob->rc = 0;
@@ -1334,8 +1333,8 @@ static int qeth_setup_channel(struct qeth_channel 
*channel, bool alloc_buffers)
return 0;
 
for (cnt = 0; cnt < QETH_CMD_BUFFER_NO; cnt++) {
-   channel->iob[cnt].data =
-   kzalloc(QETH_BUFSIZE, GFP_DMA|GFP_KERNEL);
+   channel->iob[cnt].data = kmalloc(QETH_BUFSIZE,
+GFP_KERNEL | GFP_DMA);
if (channel->iob[cnt].data == NULL)
break;
channel->iob[cnt].state = BUF_STATE_FREE;
@@ -2888,10 +2887,10 @@ static __u8 qeth_get_ipa_adp_type(enum qeth_link_types 
link_type)
 }
 
 static void qeth_fill_ipacmd_header(struct qeth_card *card,
-   struct qeth_ipa_cmd *cmd, __u8 command,
-   enum qeth_prot_versions prot)
+   struct qeth_ipa_cmd *cmd,
+   enum qeth_ipa_cmds command,
+   enum qeth_prot_versions prot)
 {
-   memset(cmd, 0, sizeof(struct qeth_ipa_cmd));
cmd->hdr.command = command;
cmd->hdr.initiator = IPA_CMD_INITIATOR_HOST;
/* cmd->hdr.seqno is set by qeth_send_control_data() */
@@ -2903,8 +2902,6 @@ static void qeth_fill_ipacmd_header(struct qeth_card 
*card,
cmd->hdr.prim_version_no = 1;
cmd->hdr.param_count = 1;
cmd->hdr.prot_version = prot;
-   cmd->hdr.ipa_supported = 0;
-   cmd->hdr.ipa_enabled = 0;
 }
 
 struct qeth_cmd_buffer *qeth_get_ipacmd_buffer(struct qeth_card *card,
@@ -5494,8 +5491,6 @@ struct qeth_cmd_buffer *qeth_get_setassparms_cmd(struct 
qeth_card *card,
cmd->data.setassparms.hdr.assist_no = ipa_func;
cmd->data.setassparms.hdr.length = 8 + len;
cmd->data.setassparms.hdr.command_code = cmd_code;
-   cmd->data.setassparms.hdr.return_code = 0;
-   cmd->data.setassparms.hdr.seq_no = 0;
}
 
return iob;
-- 
2.16.4



[PATCH] net: emac: fix fixed-link setup for the RTL8363SB switch

2018-09-17 Thread Christian Lamparter
On the Netgear WNDAP620, the emac ethernet interface isn't receiving or
transmitting any frames from/to the RTL8363SB (which identifies itself
as a RTL8367RB).

This is caused by the emac hardware not knowing the forced link
parameters for speed, duplex, pause, etc.

This raises the question of how this was working with the original
driver code, when it was necessary to set the phy_address and
phy_map to 0x. But without access to the old
PPC405/440/460 hardware, it's likely not possible to know.

Signed-off-by: Christian Lamparter 
---
 drivers/net/ethernet/ibm/emac/core.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c 
b/drivers/net/ethernet/ibm/emac/core.c
index 354c0982847b..3b398ebdb5e6 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -2677,12 +2677,17 @@ static int emac_init_phy(struct emac_instance *dev)
if (of_phy_is_fixed_link(np)) {
int res = emac_dt_mdio_probe(dev);
 
-   if (!res) {
-   res = of_phy_register_fixed_link(np);
-   if (res)
-   mdiobus_unregister(dev->mii_bus);
+   if (res)
+   return res;
+
+   res = of_phy_register_fixed_link(np);
+   dev->phy_dev = of_phy_find_device(np);
+   if (res || !dev->phy_dev) {
+   mdiobus_unregister(dev->mii_bus);
+   return res ? res : -EINVAL;
}
-   return res;
+   emac_adjust_link(dev->ndev);
+   put_device(>phy_dev->mdio.dev);
}
return 0;
}
-- 
2.19.0.rc2



Re: [PATCH net-next] liquidio: Add the features to show FEC settings and set FEC settings

2018-09-17 Thread David Miller
From: Felix Manlunas 
Date: Sun, 16 Sep 2018 22:43:32 -0700

> From: Weilin Chang 
> 
> 1. Add functions for get_fecparam and set_fecparam.
> 2. Modify lio_get_link_ksettings to display FEC setting.
> 
> Signed-off-by: Weilin Chang 
> Acked-by: Derek Chickles 
> Signed-off-by: Felix Manlunas 

Applied.


Re: [PATCHv2 net-next 1/1] net: rds: use memset to optimize the recv

2018-09-17 Thread David Miller
From: Zhu Yanjun 
Date: Sun, 16 Sep 2018 22:49:30 -0400

> The function rds_inc_init is in the receive path. Using memset can optimize
> the function rds_inc_init.
> The test result:
> 
>  Before:
>  1) + 24.950 us   |rds_inc_init [rds]();
>  After:
>  1) + 10.990 us   |rds_inc_init [rds]();
> 
> Acked-by: Santosh Shilimkar 
> Signed-off-by: Zhu Yanjun 
> ---
> V1->V2: a new patch for net-next

Applied.


Re: [PATCH v2 net] net: aquantia: memory corruption on jumbo frames

2018-09-17 Thread David Miller
From: Igor Russkikh 
Date: Sat, 15 Sep 2018 18:03:39 +0300

> From: Friedemann Gerold 
> 
> This patch fixes skb_shared area, which will be corrupted
> upon reception of 4K jumbo packets.
> 
> Originally build_skb usage purpose was to reuse page for skb to eliminate
> needs of extra fragments. But that logic does not take into account that
> skb_shared_info should be reserved at the end of skb data area.
> 
> In case packet data consumes all the page (4K), skb_shinfo location
> overflows the page. As a consequence, __build_skb zeroed shinfo data above
> the allocated page, corrupting next page.
> 
> The issue is rarely seen in real life because jumbo are normally larger
> than 4K and that causes another code path to trigger.
> But it 100% reproducible with simple scapy packet, like:
> 
> sendp(IP(dst="192.168.100.3") / TCP(dport=443) \
>   / Raw(RandString(size=(4096-40))), iface="enp1s0")
> 
> Fixes: 018423e90bee ("net: ethernet: aquantia: Add ring support code")
> 
> Reported-by: Friedemann Gerold 
> Reported-by: Michael Rauch 
> Signed-off-by: Friedemann Gerold 
> Tested-by: Nikita Danilov 
> Signed-off-by: Igor Russkikh 

APplied and queued up for -stable.


Re: [PATCH net-next 0/5] net: lantiq: Minor fixes for vrx200 and gswip

2018-09-17 Thread David Miller
From: Hauke Mehrtens 
Date: Sat, 15 Sep 2018 14:08:44 +0200

> These are mostly minor fixes to problems addresses in the latests round 
> of the review of the original series adding these driver, which were not 
> applied before the patches got merged into net-next.
> In addition it fixes a data bus error on poweroff.

Series applied.


Re: [PATCH] net: phy: phylink: fix SFP interface autodetection

2018-09-17 Thread Russell King - ARM Linux
On Mon, Sep 17, 2018 at 05:19:57PM +0300, Baruch Siach wrote:
> When the switching to the SFP detected link mode update the main
> link_interface field as well. Otherwise, the link fails to come up when
> the configured 'phy-mode' defers from the SFP detected mode.
> 
> This fixes 1GB SFP module link up on eth3 of the Macchiatobin board that
> is configured in the DT to "2500base-x" phy-mode.

link_interface isn't supposed to track the SFP link mode.  In any case,
this is only used when a PHY is attached.  For a PHY on a SFP,
phylink_connect_phy() should be using link_config.interface and not
link_interface there.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 13.8Mbps down 630kbps up
According to speedtest.net: 13Mbps down 490kbps up


Re: [PATCH v3 net-next 07/12] net: ethernet: Add helper to remove a supported link mode

2018-09-17 Thread Simon Horman
On Wed, Sep 12, 2018 at 01:53:14AM +0200, Andrew Lunn wrote:
> Some MAC hardware cannot support a subset of link modes. e.g. often
> 1Gbps Full duplex is supported, but Half duplex is not. Add a helper
> to remove such a link mode.
> 
> Signed-off-by: Andrew Lunn 
> Reviewed-by: Florian Fainelli 
> ---
>  drivers/net/ethernet/apm/xgene/xgene_enet_hw.c |  6 +++---
>  drivers/net/ethernet/cadence/macb_main.c   |  5 ++---
>  drivers/net/ethernet/freescale/fec_main.c  |  3 ++-
>  drivers/net/ethernet/microchip/lan743x_main.c  |  2 +-
>  drivers/net/ethernet/renesas/ravb_main.c   |  3 ++-
>  .../net/ethernet/stmicro/stmmac/stmmac_main.c  | 12 
>  drivers/net/phy/phy_device.c   | 18 ++
>  drivers/net/usb/lan78xx.c  |  2 +-
>  include/linux/phy.h|  1 +
>  9 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c 
> b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
> index 078a04dc1182..4831f9de5945 100644

...

> diff --git a/drivers/net/ethernet/renesas/ravb_main.c 
> b/drivers/net/ethernet/renesas/ravb_main.c
> index aff5516b781e..fb2a1125780d 100644
> --- a/drivers/net/ethernet/renesas/ravb_main.c
> +++ b/drivers/net/ethernet/renesas/ravb_main.c
> @@ -1074,7 +1074,8 @@ static int ravb_phy_init(struct net_device *ndev)
>   }
>  
>   /* 10BASE is not supported */
> - phydev->supported &= ~PHY_10BT_FEATURES;
> + phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
> + phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Full_BIT);
>  
>   phy_attached_info(phydev);
>  

...

> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index db1172db1e7c..e9ca83a438b0 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -1765,6 +1765,24 @@ int phy_set_max_speed(struct phy_device *phydev, u32 
> max_speed)
>  }
>  EXPORT_SYMBOL(phy_set_max_speed);
>  
> +/**
> + * phy_remove_link_mode - Remove a supported link mode
> + * @phydev: phy_device structure to remove link mode from
> + * @link_mode: Link mode to be removed
> + *
> + * Description: Some MACs don't support all link modes which the PHY
> + * does.  e.g. a 1G MAC often does not support 1000Half. Add a helper
> + * to remove a link mode.
> + */
> +void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode)
> +{
> + WARN_ON(link_mode > 31);
> +
> + phydev->supported &= ~BIT(link_mode);
> + phydev->advertising = phydev->supported;
> +}
> +EXPORT_SYMBOL(phy_remove_link_mode);
> +
>  static void of_set_phy_supported(struct phy_device *phydev)
>  {
>   struct device_node *node = phydev->mdio.dev.of_node;

Hi Andrew,

I believe that for the RAVB the overall effect of this change is that
10-BaseT modes are no longer advertised (although both with and without
this patch they are not supported).

Unfortunately on R-Car Gen3 M3-W (r8a7796) based Salvator-X board
I have observed that this results in the link no longer being negotiated
on one switch (the one I usually use) while it seemed fine on another.


[PATCH net] net/ipv4: defensive cipso option parsing

2018-09-17 Thread Stefan Nuernberger
commit 40413955ee26 ("Cipso: cipso_v4_optptr enter infinite loop") fixed
a possible infinite loop in the IP option parsing of CIPSO. The fix
assumes that ip_options_compile filtered out all zero length options and
that no other one-byte options beside IPOPT_END and IPOPT_NOOP exist.
While this assumption currently holds true, add explicit checks for zero
length and invalid length options to be safe for the future. Even though
ip_options_compile should have validated the options, the introduction of
new one-byte options can still confuse this code without the additional
checks.

Signed-off-by: Stefan Nuernberger 
Reviewed-by: David Woodhouse 
Reviewed-by: Simon Veith 
Cc: sta...@vger.kernel.org
---
 net/ipv4/cipso_ipv4.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 82178cc69c96..f291b57b8474 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct 
cipso_v4_doi *doi_def,
  *
  * Description:
  * Parse the packet's IP header looking for a CIPSO option.  Returns a pointer
- * to the start of the CIPSO option on success, NULL if one if not found.
+ * to the start of the CIPSO option on success, NULL if one is not found.
  *
  */
 unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
@@ -1522,9 +1522,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
int optlen;
int taglen;
 
-   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+   for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
switch (optptr[0]) {
case IPOPT_CIPSO:
+   if (!optptr[1] || optptr[1] > optlen)
+   return NULL;
return optptr;
case IPOPT_END:
return NULL;
@@ -1534,6 +1536,10 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
default:
taglen = optptr[1];
}
+
+   if (!taglen || taglen > optlen)
+   break;
+
optlen -= taglen;
optptr += taglen;
}
-- 
2.19.0



Re: [PATCH net-next] net: hns: make function hns_gmac_wait_fifo_clean() static

2018-09-17 Thread David Miller
From: Wei Yongjun 
Date: Sat, 15 Sep 2018 01:42:09 +

> Fixes the following sparse warning:
> 
> drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c:322:5: warning:
>  symbol 'hns_gmac_wait_fifo_clean' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun 

Applied.


  1   2   >