[PATCH net-next V3 1/3] net/skbuff: Introduce skb_mac_offset()

2017-02-06 Thread Amir Vadai
Introduce skb_mac_offset() that could be used to get mac header offset.

Signed-off-by: Amir Vadai 
Reviewed-by: Or Gerlitz 
---
 include/linux/skbuff.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c6a78e1892b6..a1b73b794a38 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2182,6 +2182,11 @@ static inline unsigned char *skb_mac_header(const struct 
sk_buff *skb)
return skb->head + skb->mac_header;
 }
 
+static inline int skb_mac_offset(const struct sk_buff *skb)
+{
+   return skb_mac_header(skb) - skb->data;
+}
+
 static inline int skb_mac_header_was_set(const struct sk_buff *skb)
 {
return skb->mac_header != (typeof(skb->mac_header))~0U;
-- 
2.11.0



[PATCH net-next V3 2/3] net/act_pedit: Support using offset relative to the conventional network headers

2017-02-06 Thread Amir Vadai
Extend pedit to enable the user setting offset relative to network
headers. This change would enable to work with more complex header
schemes (vs the simple IPv4 case) where setting a fixed offset relative
to the network header is not enough.

After this patch, the action has information about the exact header type
and field inside this header. This information could be used later on
for hardware offloading of pedit.

Backward compatibility is maintained:
1. Old kernel <-> new userspace
2. New kernel <-> old userspace
3. add rule using new userspace <-> dump using old userspace
4. add rule using old userspace <-> dump using new userspace

When using the extended api, new netlink attributes are being used. This
way, operation will fail in (1) and (3) - and no malformed rule will be
added or dumped. Of course, new user space that doesn't need the new
functionality can use the old netlink attributes and operation will
succeed.
Since the action supports both APIs, (2) should work, and it is easy to
write the new user space so that (4) works.

The action has a strict check that only header types and commands
it can handle are accepted. This way future additions will be much
easier.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent : \
  flower \
ip_proto tcp \
dst_port 80 \
  action pedit munge tcp dport set 8080 pipe \
  action mirred egress redirect dev veth0

Will forward tcp port whose original dest port is 80, while modifying
the destination port to 8080.

Signed-off-by: Amir Vadai 
Reviewed-by: Or Gerlitz 
---
 include/net/tc_act/tc_pedit.h|   5 +
 include/uapi/linux/tc_act/tc_pedit.h |  23 
 net/sched/act_pedit.c| 196 ---
 3 files changed, 208 insertions(+), 16 deletions(-)

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 29e38d6823df..e076f22035a5 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -3,11 +3,16 @@
 
 #include 
 
+struct tcf_pedit_key_ex {
+   enum pedit_header_type htype;
+};
+
 struct tcf_pedit {
struct tc_actioncommon;
unsigned char   tcfp_nkeys;
unsigned char   tcfp_flags;
struct tc_pedit_key *tcfp_keys;
+   struct tcf_pedit_key_ex *tcfp_keys_ex;
 };
 #define to_pedit(a) ((struct tcf_pedit *)a)
 
diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 6389959a5157..22f19eeda997 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -11,10 +11,33 @@ enum {
TCA_PEDIT_TM,
TCA_PEDIT_PARMS,
TCA_PEDIT_PAD,
+   TCA_PEDIT_PARMS_EX,
+   TCA_PEDIT_KEYS_EX,
+   TCA_PEDIT_KEY_EX,
__TCA_PEDIT_MAX
 };
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)

 
+enum {
+   TCA_PEDIT_KEY_EX_HTYPE = 1,
+   __TCA_PEDIT_KEY_EX_MAX
+};
+#define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
+
 /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is a special case for legacy users. It
+  * means no specific header type - offset is relative to the network layer
+  */
+enum pedit_header_type {
+   TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
+   __PEDIT_HDR_TYPE_MAX,
+};
+#define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
+
 struct tc_pedit_key {
__u32   mask;  /* AND */
__u32   val;   /*XOR */
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..fdd012bd3602 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define PEDIT_TAB_MASK 15
 
@@ -30,18 +31,112 @@ static struct tc_action_ops act_pedit_ops;
 
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS]   = { .len = sizeof(struct tc_pedit) },
+   [TCA_PEDIT_KEYS_EX]   = { .type = NLA_NESTED },
 };
 
+static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = 
{
+   [TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+};
+
+static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
+   u8 n)
+{
+   struct tcf_pedit_key_ex *keys_ex;
+   struct tcf_pedit_key_ex *k;
+   const struct nlattr *ka;
+   int err = -EINVAL;
+   int rem;
+
+   if (!nla || !n)
+   return NULL;
+
+   keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
+   if (!keys_ex)
+   return ERR_PTR(-ENOMEM);
+
+   k = keys_ex;
+
+   nla_for_each_nested(ka, nla, rem) {
+   struct nlattr 

[PATCH net-next V3 0/3] net/sched: act_pedit: Use offset relative to conventional network headers

2017-02-06 Thread Amir Vadai
Hi Dave,

Some FW/HW parser APIs are such that they need to get the specific header type 
(e.g
IPV4 or IPV6, TCP or UDP) and not only the networking level (e.g network or 
transport).

Enhancing the UAPI to allow for specifying that, would allow the same flows to 
be
set into both SW and HW.

This patchset also makes pedit more robust. Currently, field offsets are
specified relative to the IP header, while negative offsets are used for
MAC layer fields.

This series enables the user to set offset relative to the relevant header.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent : \
   flower \
 ip_proto tcp \
dst_port 80 \
   action \
   pedit munge ip ttl add 0xff \
   pedit munge tcp dport set 8080 \
 pipe action mirred egress redirect dev veth0

Will forward traffic destined to tcp dport 80, while modifying the
destination port to 8080, and decreasing the ttl by one.

I've uploaded a draft for the userspace [2] to make it easier to review and
test the patchset.

[1] - http://patchwork.ozlabs.org/patch/700909/
[2] - git: https://bitbucket.org/av42/iproute2.git
  branch: pedit

Patchset was tested and applied on top of upstream commit bd092ad1463c ("Merge
branch 'remove-__napi_complete_done'")

Thanks,
Amir

Changes since V2:
- Instead of reusing unused bits in existing uapi fields, using new netlink
attributes for the new information. This way new/old user space and 
new/old
kernel can live together without having misunderstandings.

Changes since V1:
- No changes - V1 was sent and didn't make it for 4.10.
- You asked me [1] why did I use specific header names instead of layers (L2,
L3...), and I explained that it is on purpose, this extra information is
planned to be used by hardware drivers to offload the action.


Amir Vadai (3):
  net/skbuff: Introduce skb_mac_offset()
  net/act_pedit: Support using offset relative to the conventional
network headers
  net/act_pedit: Introduce 'add' operation

 include/linux/skbuff.h   |   5 +
 include/net/tc_act/tc_pedit.h|   6 +
 include/uapi/linux/tc_act/tc_pedit.h |  31 +
 net/sched/act_pedit.c| 220 ---
 4 files changed, 245 insertions(+), 17 deletions(-)

-- 
2.11.0



[PATCH net-next V3 3/3] net/act_pedit: Introduce 'add' operation

2017-02-06 Thread Amir Vadai
This command could be useful to inc/dec fields.

For example, to forward any TCP packet and decrease its TTL:
$ tc filter add dev enp0s9 protocol ip parent : \
flower ip_proto tcp \
action pedit munge ip ttl add 0xff pipe \
action mirred egress redirect dev veth0

In the example above, adding 0xff to this u8 field is actually
decreasing it by one, since the operation is masked.

Signed-off-by: Amir Vadai 
Reviewed-by: Or Gerlitz 
---
 include/net/tc_act/tc_pedit.h|  1 +
 include/uapi/linux/tc_act/tc_pedit.h |  8 
 net/sched/act_pedit.c| 30 ++
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index e076f22035a5..dfbd6ee0bc7c 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -5,6 +5,7 @@
 
 struct tcf_pedit_key_ex {
enum pedit_header_type htype;
+   enum pedit_cmd cmd;
 };
 
 struct tcf_pedit {
diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 22f19eeda997..143d2b31a316 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -20,6 +20,7 @@ enum {

 
 enum {
TCA_PEDIT_KEY_EX_HTYPE = 1,
+   TCA_PEDIT_KEY_EX_CMD = 2,
__TCA_PEDIT_KEY_EX_MAX
 };
 #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
@@ -38,6 +39,13 @@ enum pedit_header_type {
 };
 #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
 
+enum pedit_cmd {
+   TCA_PEDIT_KEY_EX_CMD_SET = 0,
+   TCA_PEDIT_KEY_EX_CMD_ADD = 1,
+   __PEDIT_CMD_MAX,
+};
+#define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1)
+
 struct tc_pedit_key {
__u32   mask;  /* AND */
__u32   val;   /*XOR */
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fdd012bd3602..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -36,6 +36,7 @@ static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 
1] = {
 
 static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = 
{
[TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+   [TCA_PEDIT_KEY_EX_CMD]= { .type = NLA_U16 },
 };
 
 static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
@@ -75,14 +76,17 @@ static struct tcf_pedit_key_ex 
*tcf_pedit_keys_ex_parse(struct nlattr *nla,
if (err)
goto err_out;
 
-   if (!tb[TCA_PEDIT_KEY_EX_HTYPE]) {
+   if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
+   !tb[TCA_PEDIT_KEY_EX_CMD]) {
err = -EINVAL;
goto err_out;
}
 
k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+   k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
 
-   if (k->htype > TCA_PEDIT_HDR_TYPE_MAX) {
+   if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+   k->cmd > TCA_PEDIT_CMD_MAX) {
err = -EINVAL;
goto err_out;
}
@@ -110,7 +114,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
 
key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
 
-   if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype)) {
+   if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+   nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
nlmsg_trim(skb, keys_start);
return -EINVAL;
}
@@ -280,15 +285,19 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
struct tc_pedit_key *tkey = p->tcfp_keys;
struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
enum pedit_header_type htype = 
TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+   enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr, _data;
int offset = tkey->off;
int hoffset;
+   u32 val;
int rc;
 
if (tkey_ex) {
htype = tkey_ex->htype;
+   cmd = tkey_ex->cmd;
+
tkey_ex++;
}
 
@@ -330,7 +339,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
if (!ptr)
goto bad;
/* just do it, baby */
-   *ptr = ((*ptr & tkey->mask) ^ tkey->val);
+   switch (cmd) {
+   case TCA_PEDIT_KEY_EX_CMD_SET:
+   val = 

Re: [PATCH iproute2/net-next 0/7] tc: flower: Masked ICMP match and ND match

2017-02-06 Thread Simon Horman
On Mon, Feb 6, 2017 at 11:28 PM, Stephen Hemminger
 wrote:
> On Thu,  2 Feb 2017 11:38:33 +0100
> Simon Horman  wrote:
>
>> Hi,
>>
>> this series have several related parts.
>>
>> * tc: flower: Update documentation to indicate ARP takes IPv4 prefixes
>>
>>   Enhance documentation for consistency with later documentation changes.
>>
>> * tc: flower: use correct type when calling flower_icmp_attr_type
>>
>>   Type correction to ICMP code; should not have runtime effect
>>
>> * tc: flower: provide generic masked u8 parser helper
>>   tc: flower: provide generic masked u8 print helper
>>
>>   Generic parsing and printing of masked u8 options
>>
>> * tc: flower: support masked ICMP code and type match
>>
>>   Support masking ICMP code and type matches.
>>   Unmasked matching is already supported by iproute2
>>   Masked matching is already supported by the kernel.
>>
>>   This is used by the ND patches
>>
>> * tc: flower: Add TCA_FLOWER_KEY_ND_*
>>   tc: flower: Support matching on ND
>>
>> The last two patches are marked as RFC as they support functionality
>> submitted to, but not yet present in, the kernel.
>>
>>
>> Simon Horman (7):
>>   tc: flower: Update documentation to indicate ARP takes IPv4 prefixes
>>   tc: flower: use correct type when calling flower_icmp_attr_type
>>   tc: flower: provide generic masked u8 parser helper
>>   tc: flower: provide generic masked u8 print helper
>>   tc: flower: support masked ICMP code and type match
>>   tc: flower: Add TCA_FLOWER_KEY_ND_*
>>   tc: flower: Support matching on ND
>>
>>  include/linux/pkt_cls.h |   7 ++
>>  man/man8/tc-flower.8|  58 +--
>>  tc/f_flower.c   | 260 
>> +---
>>  3 files changed, 258 insertions(+), 67 deletions(-)
>>
>
> Since this patchset depended on changes to pkt_cls.h which are not accepted 
> upstream
> into net-next, I marked it as awaiting upstream.  When the corresponding 
> kernel changes
> are accepted into net-next please resubmit it.

Hi Stephen,

I think that the first 5 patches can be considered independently of
the last two which have the dependency you describe. Sorry for not
making this clearer; e.g. by posting them separately. Would it help if
I reposted the first five patches?


[PATCH] vhost: try avoiding avail index access when getting descriptor

2017-02-06 Thread Jason Wang
If last avail idx is not equal to cached avail idx, we're sure there's
still available buffers in the virtqueue so there's no need to re-read
avail idx. So let's skip this to avoid unnecessary userspace memory
access and memory barrier. Pktgen test show about 3% improvement on rx
pps.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 39 +++
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9f11838..bffbeeb 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1932,25 +1932,32 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq->last_avail_idx;
-   if (unlikely(vhost_get_user(vq, avail_idx, >avail->idx))) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-  >avail->idx);
-   return -EFAULT;
-   }
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
-   if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
-   vq_err(vq, "Guest moved used index from %u to %u",
-  last_avail_idx, vq->avail_idx);
-   return -EFAULT;
-   }
+   if (vq->avail_idx == vq->last_avail_idx) {
+   if (unlikely(vhost_get_user(vq, avail_idx, >avail->idx))) {
+   vq_err(vq, "Failed to access avail idx at %p\n",
+   >avail->idx);
+   return -EFAULT;
+   }
+   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
-   /* If there's nothing new since last we looked, return invalid. */
-   if (vq->avail_idx == last_avail_idx)
-   return vq->num;
+   if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
+   vq_err(vq, "Guest moved used index from %u to %u",
+   last_avail_idx, vq->avail_idx);
+   return -EFAULT;
+   }
+
+   /* If there's nothing new since last we looked, return
+* invalid.
+*/
+   if (vq->avail_idx == last_avail_idx)
+   return vq->num;
 
-   /* Only get avail ring entries after they have been exposed by guest. */
-   smp_rmb();
+   /* Only get avail ring entries after they have been
+* exposed by guest.
+*/
+   smp_rmb();
+   }
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-- 
2.7.4



[patch v2 net-next] sfc: fix an off by one bug

2017-02-06 Thread Dan Carpenter
This bug is harmless because it's just a sanity check and we always
pass valid values for "encap_type" but the test is off by one.

Fixes: 9b4108012517 ("sfc: insert catch-all filters for encapsulated traffic")
Signed-off-by: Dan Carpenter 
---
v2: Rebased against last linux-next

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 0475f1831b92..dec0c8083ff3 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -5134,7 +5134,7 @@ static int efx_ef10_filter_insert_def(struct efx_nic *efx,
 
/* quick bounds check (BCAST result impossible) */
BUILD_BUG_ON(EFX_EF10_BCAST != 0);
-   if (encap_type > ARRAY_SIZE(map) || map[encap_type] == 0) {
+   if (encap_type >= ARRAY_SIZE(map) || map[encap_type] == 0) {
WARN_ON(1);
return -EINVAL;
}


Re: [PATCH net-next 6/7] openvswitch: Add force commit.

2017-02-06 Thread Joe Stringer
On 6 February 2017 at 09:08, Pravin Shelar  wrote:
> On Thu, Feb 2, 2017 at 5:10 PM, Jarno Rajahalme  wrote:
>> Stateful network admission policy may allow connections to one
>> direction and reject connections initiated in the other direction.
>> After policy change it is possible that for a new connection an
>> overlapping conntrack entry already exist, where the connection
>> original direction is opposed to the new connection's initial packet.
>>
>> Most importantly, conntrack state relating to the current packet gets
>> the "reply" designation based on whether the original direction tuple
>> or the reply direction tuple matched.  If this "directionality" is
>> wrong w.r.t. to the stateful network admission policy it may happen
>> that packets in neither direction are correctly admitted.
>>
> Why not have the check in all commit actions? I am not sure in which
> case user would not want forced commit considering this can cause
> packet admission issue?

Seems like this case has involved one direction of a connection being
handled by a flow that committed the connection. Then something has
changed and you end up with a flow handling the opposite direction,
committing the connection. What if the first flow wasn't actually
removed? Plausibly you could end up with constant ct entry churn as
the connection is recreated each time there is a packet from an
alternating direction. Having a separate flag may assist with respect
to shooting one's own foot..


Re: [PATCH net-next 7/7] openvswitch: Pack struct sw_flow_key.

2017-02-06 Thread Joe Stringer
On 2 February 2017 at 17:10, Jarno Rajahalme  wrote:
> struct sw_flow_key has two 16-bit holes. Move the most matched
> conntrack match fields there.  In some typical cases this reduces the
> size of the key that needs to be hashed into half and into one cache
> line.
>
> Signed-off-by: Jarno Rajahalme 

Looks like this misses the zeroing in ovs_nla_get_flow_metadata();
might want to double-check for any other memset/copies of the key->ct
field.


Re: [PATCH net-next 5/7] openvswitch: Add original direction conntrack tuple to sw_flow_key.

2017-02-06 Thread Joe Stringer
On 2 February 2017 at 17:10, Jarno Rajahalme  wrote:
> Add the fields of the conntrack original direction 5-tuple to struct
> sw_flow_key.  The new fields are initially zeroed, and are populated
> whenever a conntrack action is executed and either finds or generates
> a conntrack entry.  This means that these fields exist for all packets
> that were not rejected by conntrack as untrackable.
>
> The original tuple fields in the sw_flow_key are filled from the
> original direction tuple of the conntrack entry relating to the
> current packet, or from the original direction tuple of the master
> conntrack entry, if the current conntrack entry has a master.
> Generally, expected connections of connections having an assigned
> helper (e.g., FTP), have a master conntrack entry.
>
> The main purpose of the new conntrack original tuple fields is to
> allow matching on them for policy decision purposes, with the premise
> that the admissibility of tracked connections reply packets (as well
> as original direction packets), and both direction packets of any
> related connections may be based on ACL rules applying to the master
> connection's original direction 5-tuple.  This also makes it easier to
> make policy decisions when the actual packet headers might have been
> transformed by NAT, as the original direction 5-tuple represents the
> packet headers before any such transformation.
>
> When using the original direction 5-tuple the admissibility of return
> and/or related packets need not be based on the mere existence of a
> conntrack entry, allowing separation of admission policy from the
> established conntrack state.  While existence of a conntrack entry is
> required for admission of the return or related packets, policy
> changes can render connections that were initially admitted to be
> rejected or dropped afterwards.  If the admission of the return and
> related packets was based on mere conntrack state (e.g., connection
> being in an established state), a policy change that would make the
> connection rejected or dropped would need to find and delete all
> conntrack entries affected by such a change.  When using the original
> direction 5-tuple matching the affected conntrack entries can be
> allowed to time out instead, as the established state of the
> connection would not need to be the basis for packet admission any
> more.
>
> It should be noted that the directionality of related connections may
> be the same or different than that of the master connection, and
> neither the original direction 5-tuple nor the conntrack state bits
> carry this information.  If needed, the directionality of the master
> connection can be stored in master's conntrack mark or labels, which
> are automatically inherited by the expected related connections.
>
> The fact that neither ARP nor ND packets are trackable by conntrack
> allows mutual exclusion between ARP/ND and the new conntrack original
> tuple fields.  Hence, the IP addresses are overlaid in union with ARP
> and ND fields.  This allows the sw_flow_key to not grow much due to
> this patch, but it also means that we must be careful to never use the
> new key fields with ARP or ND packets.  ARP is easy to distinguish and
> keep mutually exclusive based on the ethernet type, but ND being an
> ICMPv6 protocol requires a bit more attention.
>
> Signed-off-by: Jarno Rajahalme 
> ---

OK, maybe we need to do something a bit more to handle the NATed
related connections to address the problem in patch 1.



> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> index 738a4fa..1afe153 100644
> --- a/net/openvswitch/conntrack.c
> +++ b/net/openvswitch/conntrack.c
> @@ -155,6 +155,59 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, 
> u8 state,
> key->ct.zone = zone->id;
> key->ct.mark = ovs_ct_get_mark(ct);
> ovs_ct_get_labels(ct, >ct.labels);
> +
> +   /* Use the master if we have one. */
> +   if (ct && ct->master)
> +   ct = ct->master;

Perhaps:

if (!ct || sw_flow_key_is_nd(key) || !is_ip_any(key->eth.type)) {
/* zero everything */
return;
}

One of the things this helps us to avoid is having a comment in the
middle of an if statement.

Then afterwards,
if (ct->master)
ct = ct->master;

> +
> +   key->ct.orig_proto = 0;
> +   key->ct.orig_tp.src = 0;
> +   key->ct.orig_tp.dst = 0;
> +   if (key->eth.type == htons(ETH_P_IP)) {
> +   /* IP version must match. */
> +   if (ct && nf_ct_l3num(ct) == NFPROTO_IPV4) {

I don't quite understand how we could end up with a connection NFPROTO
that is mismatched with an IP version that we should handle here, but
if there are some legitimite cases perhaps we can pick them up and
handle them in the early exit condition above?

We can probably share a few more lines between IPv4 and IPv6 here.

> @@ -208,24 +261,54 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct 
> 

RE: [PATCH net-next v3 1/2] qed: Add infrastructure for PTP support.

2017-02-06 Thread Mintz, Yuval
> On Mon, Feb 06, 2017 at 10:41:44AM +, Mintz, Yuval wrote:
> > Richard - we're planning on sending v4 with the existing algorithm
> > [but without iterating on 'val == 0']; If you have any suggestion for
> > improving this, please share it.
> 
> Sorry, haven't had the time to look at your issue.
> 
> BTW, you said you think the performance impact is small, but did you
> measure it on typical HW?

Can't say we've focused on performance testing for the feature,
but regular testing showed sane results.



Re: [PATCH v2 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Alexei Starovoitov
On Mon, Feb 06, 2017 at 06:57:45PM -0800, Andy Lutomirski wrote:
> On Mon, Feb 6, 2017 at 5:42 PM, Alexei Starovoitov
>  wrote:
> > On Sat, Feb 04, 2017 at 08:17:57PM -0800, Andy Lutomirski wrote:
> >> On Sat, Feb 4, 2017 at 8:05 PM, Alexei Starovoitov
> >>  wrote:
> >> > On Sat, Feb 04, 2017 at 07:33:14PM -0800, Andy Lutomirski wrote:
> >> >> What does "bpf programs are global" mean?  I am genuinely unable to
> >> >> figure out what you mean.  Here are some example guesses of what you
> >> >> might mean:
> >> >>
> >> >>  - BPF programs are compiled independently of a namespace.  This is
> >> >> certainly true, but I don't think it matters.
> >> >>
> >> >>  - You want BPF programs to affect everything on the system.  But this
> >> >> doesn't seem right to be -- they only affect things in the relevant
> >> >> cgroup, so they're not global in that sense.
> >> >
> >> > All bpf program types are global in the sense that you can
> >> > make all of them to operate across all possible scopes and namespaces.
> >>
> >> I still don't understand what you mean here.  A seccomp program runs
> >> in the process that installs it and children -- it does not run in
> >> "all possible scopes".
> >
> > seccomp and classic bpf is different, since there is no concept
> > of the program there. cbpf is a stateless set of instructions
> > that belong to some entity like seccomp or socket. ebpf is stateful
> > and starts with the program, then hook and then scope.
> 
> So... are you saying that a loaded eBPF object is global in the sense
> that if you attach the same object to more than one thing (two
> sockets, say), the *same* program runs and shares its state?  If so, I
> agree, but that's still not an argument that the *same* attachment of
> an eBPF program to a cgroup should run in multiple network namespaces.
> You could also attach the (same) program once per netns and its state
> would be shared.
> 
> I'm pretty sure I've never suggested that an eBPF program be bound to
> a namespace.  I just think that a lot of things get more
> straightforward if an *attachment* of an eBPF program to a cgroup is
> bound to a single namespace.

Thank you for this whole discussion over the last few months.
Frankly in the beginning I wasn't 100% sure about the api we picked,
now I'm completely convinced that we absolutely made the right choice.
It's clean and keeps scoping constructs clear and explicit for the users.
So I am proposing that in the future we should add the ability to
scope bpf programs by netns. Just like current api scopes them
by cgroup. The attachment must be explicit.
Current api attaches type_cgroup_* program to a hook and scopes it
by a given cgroup. At that time some apps within that cgroup may
already run in different netns and attaching process may be
in yet another netns. There is no way to have sane semantics
without explicitly specifying the scope. Currently we do it
by explicitly specifying the cgroup. In the future we need
to extend it by specifying netns (without cgroup). Then
for container technologies that are based on netns we'll
have an efficient way of scoping programs to given netns.
And when afnetns is finally ready, the same scoping approach
will work for afnetns as well. For cases that need to
have two scopes at the same time (like cgroup and netns)
the bpf_sk_netns_id() helper will work one way and
some future bpf_sk_cgroup_id() helper will work from
the other side.
So far in multi-scope cases one dimension is dominating.
Like number of cgroups is way larger than number of netns,
so explicit bpf_sk_netns_id() from inside the programs
is faster than doing the same in the kernel.
And if in the future there will be a case with a lot of
cgroups and a lot of netns at the same time, we'll extend
the api further to specify two scopes to bpf_prog_attach command.
The kernel side will probably need a hashtable to lookup
a bpf prog based on (cgroup, netns) pair of pointers.

> >> A socket filter runs on a single socket and
> >> therefore runs in a single netns.  So presumably I'm still
> >> misunderstanding you
> >
> > in classic - yes. ebpf can have the same program attached to
> > multiple sockets in different netns.
> > For classic - the object is the socket and the user can only
> > manipulate that socket. For extended - the object is the program
> > and it can exist on its own. Like the program can be pinned in bpffs
> > while it's not attached to any hook.
> > For classic bpf the ideas of CRIU naturally apply, since
> > it checkpoints the socket and it happens that socket has
> > a set of statless cbpf instructions within. So it's
> > expected to save/restore cbpf as part of socket save/restore.
> > In case of ebpf the program exists independently of the socket.
> 
> True.
> 
> > Doing save/restore of the ebpf program attached to a socket
> > is meaningless, since it could be pinned in bpffs, attached
> > to other sockets, has state in bpf maps, 

Re: [PATCH net-next v3 1/2] qed: Add infrastructure for PTP support.

2017-02-06 Thread Richard Cochran
On Mon, Feb 06, 2017 at 10:41:44AM +, Mintz, Yuval wrote:
> Richard - we're planning on sending v4 with the existing algorithm
> [but without iterating on 'val == 0']; If you have any suggestion for
> improving this, please share it.

Sorry, haven't had the time to look at your issue.

BTW, you said you think the performance impact is small, but did you
measure it on typical HW?

Thanks,
Richard


linux-next: manual merge of the akpm tree with the net-next tree

2017-02-06 Thread Stephen Rothwell
Hi Andrew,

Today's linux-next merge of the akpm tree got a conflict in:

  drivers/net/can/flexcan.c

between commits:

  9eb7aa891101 ("can: flexcan: add quirk FLEXCAN_QUIRK_ENABLE_EACEN_RRS")
  b3cf53e988ce ("can: flexcan: add support for timestamp based rx-offload")

from the net-next tree and commit:

  "scripts/spelling.txt: add "disble(d)" pattern and fix typo instances"

from the akpm tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/can/flexcan.c
index ea57fed375c6,4dc71bce525c..
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@@ -195,9 -189,7 +195,9 @@@
   */
  #define FLEXCAN_QUIRK_BROKEN_ERR_STATEBIT(1) /* [TR]WRN_INT not 
connected */
  #define FLEXCAN_QUIRK_DISABLE_RXFGBIT(2) /* Disable RX FIFO Global mask */
 -#define FLEXCAN_QUIRK_DISABLE_MECRBIT(3) /* Disable Memory error 
detection */
 +#define FLEXCAN_QUIRK_ENABLE_EACEN_RRSBIT(3) /* Enable EACEN and RRS 
bit in ctrl2 */
- #define FLEXCAN_QUIRK_DISABLE_MECRBIT(4) /* Disble Memory error detection 
*/
++#define FLEXCAN_QUIRK_DISABLE_MECRBIT(4) /* Disable Memory error 
detection */
 +#define FLEXCAN_QUIRK_USE_OFF_TIMESTAMP   BIT(5) /* Use timestamp based 
offloading */
  
  /* Structure of the message buffer */
  struct flexcan_mb {


[PATCH v3 1/3] Documentation: devicetree: Add PHY no lane swap binding

2017-02-06 Thread Lukasz Majewski
Add the documentation to avoid PHY lane swapping. This is a boolean
entry to notify the PHY device drivers that the TX/RX lanes do NOT
need to be swapped.
The use case for this binding mostly happens after wrong HW
configuration of PHY IC during bootstrap.

Signed-off-by: Lukasz Majewski 

---
Changes for v3:
- Change binding to property
---
 Documentation/devicetree/bindings/net/phy.txt | 4 
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/phy.txt 
b/Documentation/devicetree/bindings/net/phy.txt
index fb5056b..b558576 100644
--- a/Documentation/devicetree/bindings/net/phy.txt
+++ b/Documentation/devicetree/bindings/net/phy.txt
@@ -39,6 +39,10 @@ Optional Properties:
 - enet-phy-lane-swap: If set, indicates the PHY will swap the TX/RX lanes to
   compensate for the board being designed with the lanes swapped.
 
+- enet-phy-lane-no-swap: If set, indicates that PHY will disable swap of the
+  TX/RX lanes. This property allows the PHY to work correctly after e.g. wrong
+  bootstrap configuration caused by issues in PCB layout design.
+
 - eee-broken-100tx:
 - eee-broken-1000t:
 - eee-broken-10gt:
-- 
2.1.4



[PATCH] Make EN2 pin optional in the TRF7970A driver

2017-02-06 Thread Heiko Schocher
From: Guan Ben 

Make the EN2 pin optional. This is useful for boards
which have this pin hard-wired, for example to ground.

Signed-off-by: Guan Ben 
Signed-off-by: Mark Jonas 
Signed-off-by: Heiko Schocher 

---

 .../devicetree/bindings/net/nfc/trf7970a.txt   |  4 ++--
 drivers/nfc/trf7970a.c | 26 --
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt 
b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
index 32b35a0..5889a3d 100644
--- a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
+++ b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
@@ -5,8 +5,8 @@ Required properties:
 - spi-max-frequency: Maximum SPI frequency (<= 200).
 - interrupt-parent: phandle of parent interrupt handler.
 - interrupts: A single interrupt specifier.
-- ti,enable-gpios: Two GPIO entries used for 'EN' and 'EN2' pins on the
-  TRF7970A.
+- ti,enable-gpios: One or two GPIO entries used for 'EN' and 'EN2' pins on the
+  TRF7970A. EN2 is optional.
 - vin-supply: Regulator for supply voltage to VIN pin
 
 Optional SoC Specific Properties:
diff --git a/drivers/nfc/trf7970a.c b/drivers/nfc/trf7970a.c
index 26c9dbb..75079fb 100644
--- a/drivers/nfc/trf7970a.c
+++ b/drivers/nfc/trf7970a.c
@@ -1885,8 +1885,10 @@ static int trf7970a_power_up(struct trf7970a *trf)
usleep_range(5000, 6000);
 
if (!(trf->quirks & TRF7970A_QUIRK_EN2_MUST_STAY_LOW)) {
-   gpio_set_value(trf->en2_gpio, 1);
-   usleep_range(1000, 2000);
+   if (gpio_is_valid(trf->en2_gpio)) {
+   gpio_set_value(trf->en2_gpio, 1);
+   usleep_range(1000, 2000);
+   }
}
 
gpio_set_value(trf->en_gpio, 1);
@@ -1914,7 +1916,8 @@ static int trf7970a_power_down(struct trf7970a *trf)
}
 
gpio_set_value(trf->en_gpio, 0);
-   gpio_set_value(trf->en2_gpio, 0);
+   if (gpio_is_valid(trf->en2_gpio))
+   gpio_set_value(trf->en2_gpio, 0);
 
ret = regulator_disable(trf->regulator);
if (ret)
@@ -2032,15 +2035,14 @@ static int trf7970a_probe(struct spi_device *spi)
 
trf->en2_gpio = of_get_named_gpio(np, "ti,enable-gpios", 1);
if (!gpio_is_valid(trf->en2_gpio)) {
-   dev_err(trf->dev, "No EN2 GPIO property\n");
-   return trf->en2_gpio;
-   }
-
-   ret = devm_gpio_request_one(trf->dev, trf->en2_gpio,
-   GPIOF_DIR_OUT | GPIOF_INIT_LOW, "trf7970a EN2");
-   if (ret) {
-   dev_err(trf->dev, "Can't request EN2 GPIO: %d\n", ret);
-   return ret;
+   dev_info(trf->dev, "No EN2 GPIO property\n");
+   } else {
+   ret = devm_gpio_request_one(trf->dev, trf->en2_gpio,
+   GPIOF_DIR_OUT | GPIOF_INIT_LOW, "trf7970a EN2");
+   if (ret) {
+   dev_err(trf->dev, "Can't request EN2 GPIO: %d\n", ret);
+   return ret;
+   }
}
 
if (of_property_read_bool(np, "en2-rf-quirk"))
-- 
2.7.4



[PATCH v3 2/3] net: phy: dp83867: Add lane swapping support in the DP83867 TI's PHY driver

2017-02-06 Thread Lukasz Majewski
This patch adds support for enabling or disabling the lane swapping (called
"port mirroring" in PHY's CFG4 register) feature of the DP83867 TI's PHY
device.

One use case is when bootstrap configuration enables this feature (because
of e.g. LED_0 wrong wiring) so then one needs to disable it in software
(at u-boot/Linux).

Signed-off-by: Lukasz Majewski 
---
Changes for v3:
- Add "line swapping" to the patch description
- Add DP83867_PORT_MIRROING_KEEP enum for better code readability

Changes for v2:
- use "net-phy-lane-swap" and "net-phy-lane-no-swap" generic PHY properties.
  instead of TI specific one
---
 drivers/net/phy/dp83867.c | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
index ca1b462..be6fa24 100644
--- a/drivers/net/phy/dp83867.c
+++ b/drivers/net/phy/dp83867.c
@@ -32,6 +32,7 @@
 #define DP83867_CFG3   0x1e
 
 /* Extended Registers */
+#define DP83867_CFG40x0031
 #define DP83867_RGMIICTL   0x0032
 #define DP83867_RGMIIDCTL  0x0086
 #define DP83867_IO_MUX_CFG 0x0170
@@ -70,11 +71,21 @@
 #define DP83867_IO_MUX_CFG_IO_IMPEDANCE_MAX0x0
 #define DP83867_IO_MUX_CFG_IO_IMPEDANCE_MIN0x1f
 
+/* CFG4 bits */
+#define DP83867_CFG4_PORT_MIRROR_EN  BIT(0)
+
+enum {
+   DP83867_PORT_MIRROING_KEEP,
+   DP83867_PORT_MIRROING_EN,
+   DP83867_PORT_MIRROING_DIS,
+};
+
 struct dp83867_private {
int rx_id_delay;
int tx_id_delay;
int fifo_depth;
int io_impedance;
+   int port_mirroring;
 };
 
 static int dp83867_ack_interrupt(struct phy_device *phydev)
@@ -111,6 +122,24 @@ static int dp83867_config_intr(struct phy_device *phydev)
return phy_write(phydev, MII_DP83867_MICR, micr_status);
 }
 
+static int dp83867_config_port_mirroring(struct phy_device *phydev)
+{
+   struct dp83867_private *dp83867 =
+   (struct dp83867_private *)phydev->priv;
+   u16 val;
+
+   val = phy_read_mmd_indirect(phydev, DP83867_CFG4, DP83867_DEVADDR);
+
+   if (dp83867->port_mirroring == DP83867_PORT_MIRROING_EN)
+   val |= DP83867_CFG4_PORT_MIRROR_EN;
+   else
+   val &= ~DP83867_CFG4_PORT_MIRROR_EN;
+
+   phy_write_mmd_indirect(phydev, DP83867_CFG4, DP83867_DEVADDR, val);
+
+   return 0;
+}
+
 #ifdef CONFIG_OF_MDIO
 static int dp83867_of_init(struct phy_device *phydev)
 {
@@ -144,6 +173,12 @@ static int dp83867_of_init(struct phy_device *phydev)
 phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID))
return ret;
 
+   if (of_property_read_bool(of_node, "enet-phy-lane-swap"))
+   dp83867->port_mirroring = DP83867_PORT_MIRROING_EN;
+
+   if (of_property_read_bool(of_node, "enet-phy-lane-no-swap"))
+   dp83867->port_mirroring = DP83867_PORT_MIRROING_DIS;
+
return of_property_read_u32(of_node, "ti,fifo-depth",
   >fifo_depth);
 }
@@ -228,6 +263,9 @@ static int dp83867_config_init(struct phy_device *phydev)
phy_write(phydev, DP83867_CFG3, val);
}
 
+   if (dp83867->port_mirroring != DP83867_PORT_MIRROING_KEEP)
+   dp83867_config_port_mirroring(phydev);
+
return 0;
 }
 
-- 
2.1.4



[PATCH v3 3/3] net: phy: dp83867: Recover from "port mirroring" N/A MODE4

2017-02-06 Thread Lukasz Majewski
The DP83867 when not properly bootstrapped - especially with LED_0 pin -
can enter N/A MODE4 for "port mirroring" feature.

To provide normal operation of the PHY, one needs not only to explicitly
disable the port mirroring feature, but as well stop some IC internal
testing (which disables RGMII communication).

To do that, the STRAP_STS1 (0x006E) register must be read and RESERVED bit
11 examined. When it is set, another RESERVED bit (11) in the PHYCR
(0x0010) register must be cleared to disable testing mode and enable RGMII
communication.

Thorough explanation of the problem can be found at following e2e thread:
"DP83867IR: Problem with RESERVED bits in PHY Control Register (PHYCR) -
Linux driver"

https://e2e.ti.com/support/interface/ethernet/f/903/p/571313/2096954#2096954

Signed-off-by: Lukasz Majewski 
---
Changes for v3:
- None
---
 drivers/net/phy/dp83867.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
index be6fa24..1986553 100644
--- a/drivers/net/phy/dp83867.c
+++ b/drivers/net/phy/dp83867.c
@@ -34,6 +34,7 @@
 /* Extended Registers */
 #define DP83867_CFG40x0031
 #define DP83867_RGMIICTL   0x0032
+#define DP83867_STRAP_STS1 0x006E
 #define DP83867_RGMIIDCTL  0x0086
 #define DP83867_IO_MUX_CFG 0x0170
 
@@ -58,9 +59,13 @@
 #define DP83867_RGMII_TX_CLK_DELAY_EN  BIT(1)
 #define DP83867_RGMII_RX_CLK_DELAY_EN  BIT(0)
 
+/* STRAP_STS1 bits */
+#define DP83867_STRAP_STS1_RESERVEDBIT(11)
+
 /* PHY CTRL bits */
 #define DP83867_PHYCR_FIFO_DEPTH_SHIFT 14
 #define DP83867_PHYCR_FIFO_DEPTH_MASK  (3 << 14)
+#define DP83867_PHYCR_RESERVED_MASKBIT(11)
 
 /* RGMIIDCTL bits */
 #define DP83867_RGMII_TX_CLK_DELAY_SHIFT   4
@@ -192,7 +197,7 @@ static int dp83867_of_init(struct phy_device *phydev)
 static int dp83867_config_init(struct phy_device *phydev)
 {
struct dp83867_private *dp83867;
-   int ret, val;
+   int ret, val, bs;
u16 delay;
 
if (!phydev->priv) {
@@ -215,6 +220,22 @@ static int dp83867_config_init(struct phy_device *phydev)
return val;
val &= ~DP83867_PHYCR_FIFO_DEPTH_MASK;
val |= (dp83867->fifo_depth << DP83867_PHYCR_FIFO_DEPTH_SHIFT);
+
+   /* The code below checks if "port mirroring" N/A MODE4 has been
+* enabled during power on bootstrap.
+*
+* Such N/A mode enabled by mistake can put PHY IC in some
+* internal testing mode and disable RGMII transmission.
+*
+* In this particular case one needs to check STRAP_STS1
+* register's bit 11 (marked as RESERVED).
+*/
+
+   bs = phy_read_mmd_indirect(phydev, DP83867_STRAP_STS1,
+  DP83867_DEVADDR);
+   if (bs & DP83867_STRAP_STS1_RESERVED)
+   val &= ~DP83867_PHYCR_RESERVED_MASK;
+
ret = phy_write(phydev, MII_DP83867_PHYCTRL, val);
if (ret)
return ret;
-- 
2.1.4



RE: [PATCH net-next/iproute 2/5] tc: bash-completion: Prepare action autocomplete to support several actions

2017-02-06 Thread Yotam Gigi
>-Original Message-
>From: Stephen Hemminger [mailto:step...@networkplumber.org]
>Sent: Tuesday, February 07, 2017 12:11 AM
>To: Yotam Gigi 
>Cc: netdev@vger.kernel.org; Elad Raz ; Ido Schimmel
>; Jiri Pirko ; j...@mojatatu.com;
>m...@mojatatu.com
>Subject: Re: [PATCH net-next/iproute 2/5] tc: bash-completion: Prepare action
>autocomplete to support several actions
>
>On Mon,  6 Feb 2017 15:19:21 +0200
>Yotam Gigi  wrote:
>
>> The action autocomplete routine (_tc_action_options) currently does not
>> support several actions statements in one tc command line as it uses the
>> _tc_once_attr and _tc_one_from_list.
>>
>> For example, in that case:
>>
>> $ tc filter add dev eth0 handle : u32 [...]  \
>> action sample group 5 rate 12 \
>> action sample 
>>
>> the _tc_once_attr function, when invoked with "group rate" will not
>> suggest those as they already exist on the command line.
>>
>> Fix the function to use the _from variant, thus allowing each action
>> autocomplete start from the action keyword, and not from the beginning of
>> the command line.
>>
>> Signed-off-by: Yotam Gigi 
>
>This patch does not apply cleanly to current iproute2 net-next tree.
>Please rebase and resubmit the whole series again.

Sorry for that. I will send it again.

Thanks!

>
>
>$ cat bash-completion/tc.rej
>--- bash-completion/tc
>+++ bash-completion/tc
>@@ -454,26 +454,28 @@ _tc_filter_options()
> # Returns 0 is completion should stop after running this function, 1 
> otherwise.
> _tc_action_options()
> {
>-case $1 in
>+local from=$1
>+local action=${words[from]}
>+case $action in
> bpf)
> _tc_bpf_options
> return 0
> ;;
> mirred)
>-_tc_one_of_list 'ingress egress'
>-_tc_one_of_list 'mirror redirect'
>-_tc_once_attr 'index dev'
>+_tc_one_of_list_from $from 'ingress egress'
>+_tc_one_of_list_from $from 'mirror redirect'
>+_tc_once_attr_from $from 'index dev'
> return 0
> ;;
> sample)
>-_tc_once_attr 'rate'
>-_tc_once_attr 'trunc'
>-_tc_once_attr 'group'
>+_tc_once_attr_from $from 'rate'
>+_tc_once_attr_from $from 'trunc'
>+_tc_once_attr_from $from 'group'
> return 0
> ;;
> gact)
>-_tc_one_of_list 'reclassify drop continue pass'
>-_tc_once_attr 'random'
>+_tc_one_of_list_from $from 'reclassify drop continue pass'
>+_tc_once_attr_from $from 'random'
> return 0
> ;;
> esac


Re: [net-next PATCH v2 0/5] XDP adjust head support for virtio

2017-02-06 Thread Michael S. Tsirkin
On Thu, Feb 02, 2017 at 07:14:05PM -0800, John Fastabend wrote:
> This series adds adjust head support for virtio. The following is my
> test setup. I use qemu + virtio as follows,
> 
> ./x86_64-softmmu/qemu-system-x86_64 \
>   -hda /var/lib/libvirt/images/Fedora-test0.img \
>   -m 4096  -enable-kvm -smp 2 -netdev tap,id=hn0,queues=4,vhost=on \
>   -device 
> virtio-net-pci,netdev=hn0,mq=on,guest_tso4=off,guest_tso6=off,guest_ecn=off,guest_ufo=off,vectors=9
> 
> In order to use XDP with virtio until LRO is supported TSO must be
> turned off in the host. The important fields in the above command line
> are the following,
> 
>   guest_tso4=off,guest_tso6=off,guest_ecn=off,guest_ufo=off
> 
> Also note it is possible to consume more queues than can be supported
> because when XDP is enabled for retransmit XDP attempts to use a queue
> per cpu. My standard queue count is 'queues=4'.
> 
> After loading the VM I run the relevant XDP test programs in,
> 
>   ./samples/bpf
> 
> For this series I tested xdp1, xdp2, and xdp_tx_iptunnel. I usually test
> with iperf (-d option to get bidirectional traffic), ping, and pktgen.
> I also have a modified xdp1 that returns XDP_PASS on any packet to ensure
> the normal traffic path to the stack continues to work with XDP loaded.
> 
> It would be great to automate this soon. At the moment I do it by hand
> which is starting to get tedious.
> 
> v2: original series dropped trace points after merge.

So I'd say ok, let's go ahead and merge this for now.

However, I came up with a new idea for the future and I'd like to show
where I'm going.  The idea is that we don't use s/g buffers on RX, so we
have a pointer per descriptor untapped.  So we can allow users to stick
their own pointer in there, if they promise not to use s/g on this vq.
With a full extra pointer to play with, we can go wild.

Take a look but it doesn't even build yet.
Need to roll it out to all devices etc.

--->

Signed-off-by: Michael S. Tsirkin 

--

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 409aeaa..b59e95e 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -263,6 +263,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
+   void *ctx,
gfp_t gfp)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
@@ -275,6 +276,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
START_USE(vq);
 
BUG_ON(data == NULL);
+   BUG_ON(ctx && vq->indirect);
 
if (unlikely(vq->broken)) {
END_USE(vq);
@@ -389,6 +391,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
vq->desc_state[head].data = data;
if (indirect)
vq->desc_state[head].indir_desc = desc;
+   if (ctx)
+   vq->desc_state[head].indir_desc = ctx;
 
/* Put entry in available array (but don't update avail->idx until they
 * do sync). */
@@ -461,7 +465,8 @@ int virtqueue_add_sgs(struct virtqueue *_vq,
for (sg = sgs[i]; sg; sg = sg_next(sg))
total_sg++;
}
-   return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp);
+   return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs,
+data, NULL, gfp);
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
 
@@ -483,7 +488,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq,
 void *data,
 gfp_t gfp)
 {
-   return virtqueue_add(vq, , num, 1, 0, data, gfp);
+   return virtqueue_add(vq, , num, 1, 0, data, NULL, gfp);
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
 
@@ -505,7 +510,31 @@ int virtqueue_add_inbuf(struct virtqueue *vq,
void *data,
gfp_t gfp)
 {
-   return virtqueue_add(vq, , num, 0, 1, data, gfp);
+   return virtqueue_add(vq, , num, 0, 1, data, NULL, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
+
+/**
+ * virtqueue_add_inbuf_ctx - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg writable by other side
+ * @data: the token identifying the buffer.
+ * @ctx: extra context for the token
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
+   struct scatterlist *sg, unsigned int num,
+   void *data,
+   void *ctx,
+   gfp_t gfp)
+{
+   return virtqueue_add(vq, 

Re: [PATCH net-next 0/4] bridge: improve cache utilization

2017-02-06 Thread David Miller
From: Nikolay Aleksandrov 
Date: Sat,  4 Feb 2017 18:05:05 +0100

> This is the first set which begins to deal with the bad bridge cache
> access patterns. The first patch rearranges the bridge and port structs
> a little so the frequently (and closely) accessed members are in the same
> cache line. The second patch then moves the garbage collection to a
> workqueue trying to improve system responsiveness under load (many fdbs)
> and more importantly removes the need to check if the matched entry is
> expired in __br_fdb_get which was a major source of false-sharing.
> The third patch is a preparation for the final one which
> If properly configured, i.e. ports bound to CPUs (thus updating "updated"
> locally) then the bridge's HitM goes from 100% to 0%, but even without
> binding we get a win because previously every lookup that iterated over
> the hash chain caused false-sharing due to the first cache line being
> used for both mac/vid and used/updated fields.
> 
> Some results from tests I've run:
> (note that these were run in good conditions for the baseline, everything
>  ran on a single NUMA node and there were only 3 fdbs)
> 
> 1. baseline
> 100% Load HitM on the fdbs (between everyone who has done lookups and hit
> one of the 3 hash chains of the communicating
> src/dst fdbs)
> Overall 5.06% Load HitM for the bridge, first place in the list
> 
> 2. patched & ports bound to CPUs
> 0% Local load HitM, bridge is not even in the c2c report list
> Also there's 3% consistent improvement in netperf tests.

Looks great, series applied, thanks!


Re: [PATHv3 net-next] bpf: enable verifier to add 0 to packet ptr

2017-02-06 Thread David Miller
From: William Tu 
Date: Sat,  4 Feb 2017 08:37:29 -0800

> The patch fixes the case when adding a zero value to the packet
> pointer.  The zero value could come from src_reg equals type
> BPF_K or CONST_IMM.  The patch fixes both, otherwise the verifer
> reports the following error:
>   [...]
> R0=imm0,min_value=0,max_value=0
> R1=pkt(id=0,off=0,r=4)
> R2=pkt_end R3=fp-12
> R4=imm4,min_value=4,max_value=4
> R5=pkt(id=0,off=4,r=4)
>   269: (bf) r2 = r0 // r2 becomes imm0
>   270: (77) r2 >>= 3
>   271: (bf) r4 = r1 // r4 becomes pkt ptr
>   272: (0f) r4 += r2// r4 += 0
>   addition of negative constant to packet pointer is not allowed
> 
> Signed-off-by: William Tu 
> Signed-off-by: Mihai Budiu 
> Cc: Daniel Borkmann 
> Cc: Alexei Starovoitov 
> Acked-by: Daniel Borkmann 

Applied, thanks.


Re: [PATCH net 0/2] read vnet_hdr_sz once

2017-02-06 Thread David Miller
From: Willem de Bruijn 
Date: Fri,  3 Feb 2017 18:20:47 -0500

> Tuntap devices allow concurrent use and update of field vnet_hdr_sz.
> Read the field once to avoid TOCTOU.

Series applied and queued up for -stable, thanks.


Re: [PATCH net] tcp: avoid infinite loop in tcp_splice_read()

2017-02-06 Thread David Miller
From: Eric Dumazet 
Date: Fri, 03 Feb 2017 14:59:38 -0800

> From: Eric Dumazet 
> 
> Splicing from TCP socket is vulnerable when a packet with URG flag is
> received and stored into receive queue.
> 
> __tcp_splice_read() returns 0, and sk_wait_data() immediately
> returns since there is the problematic skb in queue.
> 
> This is a nice way to burn cpu (aka infinite loop) and trigger
> soft lockups.
> 
> Again, this gem was found by syzkaller tool.
> 
> Fixes: 9c55e01c0cc8 ("[TCP]: Splice receive support.")
> Signed-off-by: Eric Dumazet 
> Reported-by: Dmitry Vyukov  
> Cc: Willy Tarreau 

Applied and queued up for -stable, thanks Eric.


Re: [PATCH net-next][v2] bpf: test for AND edge cases

2017-02-06 Thread David Miller
From: Josef Bacik 
Date: Fri, 3 Feb 2017 16:25:23 -0500

> These two tests are based on the work done for f23cc643f9ba.  The first test 
> is
> just a basic one to make sure we don't allow AND'ing negative values, even if 
> it
> would result in a valid index for the array.  The second is a cleaned up 
> version
> of the original testcase provided by Jann Horn that resulted in the commit.
> 
> Acked-by: Alexei Starovoitov 
> Acked-by: Daniel Borkmann 
> Signed-off-by: Josef Bacik 
> ---
> v1->v2:
> -rebased onto net-next

Applied.


Re: [RFC] igmp: address pmc kmemleak from on igmpv3_del_delrec()

2017-02-06 Thread David Miller
From: Cong Wang 
Date: Mon, 6 Feb 2017 16:32:14 -0800

> On Fri, Feb 3, 2017 at 1:20 PM, Luis R. Rodriguez  wrote:
>> When we igmpv3_add_delrec() we kzalloc the pmc, but when users
>> call igmpv3_del_delrec() we never free the pmc. This was caught
>> by the following kmemleak splat:
>>
>> unreferenced object 0x99666ff43b40 (size 192):
>>   comm "systemd-resolve", pid 1258, jiffies 4309905600 (age 2138.352s)
>>   hex dump (first 32 bytes):
>> 00 6a 64 72 66 99 ff ff e0 00 00 fc 00 00 00 00  .jdrf...
>> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
>>   backtrace:
>> [] kmemleak_alloc+0x4a/0xa0
>> [] kmem_cache_alloc_trace+0x107/0x240
>> [] igmp_group_dropped+0xfd/0x270
>> [] ip_mc_dec_group+0xaf/0x110
>> [] ip_mc_leave_group+0xb6/0x140
>> [] do_ip_setsockopt.isra.13+0x4c7/0xed0
>> [] ip_setsockopt+0x34/0xb0
>> [] udp_setsockopt+0x1b/0x30
>> [] sock_common_setsockopt+0x14/0x20
>> [] SyS_setsockopt+0x80/0xe0
>> [] do_syscall_64+0x5b/0xc0
>> [] return_from_SYSCALL_64+0x0/0x6a
>> [] 0x
>>
>> Signed-off-by: Luis R. Rodriguez 
>> ---
>>
>> I can reproduce this over time on a qemu box running next-20170125.
>> After running this for a while I no longer see the splat. This needs
>> confirmation form folks more familiar with the code, hence RFC. If
>> this is a real fix we need appropriate tags for the patch.
> 
> 
> Looks good to me. Adding some people who recent touched it to CC.

The kfree() was removed in commit:


commit 24803f38a5c0b6c57ed800b47e695f9ce474bc3a
Author: Hangbin Liu 
Date:   Mon Nov 14 16:16:28 2016 +0800

igmp: do not remove igmp souce list info when set link down


But I don't see what could possibly continue to reference this
pmc after we unlink it from the list.


[PATCH net-next 1/9] mlx4: use __skb_fill_page_desc()

2017-02-06 Thread Eric Dumazet
Or we might miss the fact that a page was allocated from memory reserves.

Fixes: dceeab0e5258 ("mlx4: support __GFP_MEMALLOC for rx")
Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
f15ddba3659aac38471059c6bcbf05071794..03f1713c94c7fa57e9eaaf87fe38a0a6d372 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -601,10 +601,10 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv 
*priv,
dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
DMA_FROM_DEVICE);
 
-   /* Save page reference in skb */
-   __skb_frag_set_page(_frags_rx[nr], frags[nr].page);
-   skb_frag_size_set(_frags_rx[nr], frag_info->frag_size);
-   skb_frags_rx[nr].page_offset = frags[nr].page_offset;
+   __skb_fill_page_desc(skb, nr, frags[nr].page,
+frags[nr].page_offset,
+frag_info->frag_size);
+
skb->truesize += frag_info->frag_stride;
frags[nr].page = NULL;
}
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next 7/9] mlx4: removal of frag_sizes[]

2017-02-06 Thread Eric Dumazet
We will soon use order-0 pages, and frag truesize will more precisely
match real sizes.

In the new model, we prefer to use <= 2048 bytes fragments, so that
we can use page-recycle technique on PAGE_SIZE=4096 arches.

We will still pack as much frames as possible on arches with big
pages, like PowerPC.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 24 ++--
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  8 
 2 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
6c95694f6390aa9fbc5f941a97e305815949..dd3bfcfea10c4545dfeda0f999449b13ca91 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1178,13 +1178,6 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int 
budget)
return done;
 }
 
-static const int frag_sizes[] = {
-   FRAG_SZ0,
-   FRAG_SZ1,
-   FRAG_SZ2,
-   FRAG_SZ3
-};
-
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -1208,13 +1201,16 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
int buf_size = 0;
 
while (buf_size < eff_mtu) {
-   priv->frag_info[i].frag_size =
-   (eff_mtu > buf_size + frag_sizes[i]) ?
-   frag_sizes[i] : eff_mtu - buf_size;
-   priv->frag_info[i].frag_stride =
-   ALIGN(priv->frag_info[i].frag_size,
- SMP_CACHE_BYTES);
-   buf_size += priv->frag_info[i].frag_size;
+   int frag_size = eff_mtu - buf_size;
+
+   if (i < MLX4_EN_MAX_RX_FRAGS - 1)
+   frag_size = min(frag_size, 2048);
+
+   priv->frag_info[i].frag_size = frag_size;
+
+   priv->frag_info[i].frag_stride = ALIGN(frag_size,
+  SMP_CACHE_BYTES);
+   buf_size += frag_size;
i++;
}
priv->rx_page_order = MLX4_EN_ALLOC_PREFER_ORDER;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 
c9916b75b94bc9364b2cbe6da06a5ea385c6..a5bb0103ad8fd3b3f4b3d16099b7bf7ba01b 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -104,14 +104,6 @@
 
 #define MLX4_EN_ALLOC_PREFER_ORDER PAGE_ALLOC_COSTLY_ORDER
 
-/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
- * and 4K allocations) */
-enum {
-   FRAG_SZ0 = 1536 - NET_IP_ALIGN,
-   FRAG_SZ1 = 4096,
-   FRAG_SZ2 = 4096,
-   FRAG_SZ3 = MLX4_EN_ALLOC_SIZE
-};
 #define MLX4_EN_MAX_RX_FRAGS   4
 
 /* Maximum ring sizes */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next 9/9] mlx4: add page recycling in receive path

2017-02-06 Thread Eric Dumazet
Same technique than some Intel drivers, for arches where PAGE_SIZE = 4096

In most cases, pages are reused because they were consumed
before we could loop around the RX ring.

This brings back performance, and is even better,
a single TCP flow reaches 30Gbit on my hosts.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 238 ---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |   1 -
 2 files changed, 68 insertions(+), 171 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
be4f3491a4fcb6ee0e9fe4e71abfd2bc5373..6854a19087edbf0bc9bf29e20a82deaaf043 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -50,10 +50,9 @@
 
 #include "mlx4_en.h"
 
-static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
-   struct mlx4_en_rx_alloc *page_alloc,
-   const struct mlx4_en_frag_info *frag_info,
-   gfp_t gfp)
+static int mlx4_alloc_page(const struct mlx4_en_priv *priv,
+  struct mlx4_en_rx_alloc *frag,
+  gfp_t gfp)
 {
struct page *page;
dma_addr_t dma;
@@ -66,142 +65,40 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
put_page(page);
return -ENOMEM;
}
-   page_alloc->page = page;
-   page_alloc->dma = dma;
-   page_alloc->page_offset = 0;
-   /* Not doing get_page() for each frag is a big win
-* on asymetric workloads. Note we can not use atomic_set().
-*/
-   page_ref_add(page, PAGE_SIZE / frag_info->frag_stride - 1);
+   frag->page = page;
+   frag->dma = dma;
+   frag->page_offset = priv->rx_headroom;
return 0;
 }
 
-static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
+static int mlx4_en_alloc_frags(const struct mlx4_en_priv *priv,
   struct mlx4_en_rx_desc *rx_desc,
   struct mlx4_en_rx_alloc *frags,
-  struct mlx4_en_rx_alloc *ring_alloc,
   gfp_t gfp)
 {
-   struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
-   const struct mlx4_en_frag_info *frag_info;
-   struct page *page;
int i;
 
-   for (i = 0; i < priv->num_frags; i++) {
-   frag_info = >frag_info[i];
-   page_alloc[i] = ring_alloc[i];
-   page_alloc[i].page_offset += frag_info->frag_stride;
-
-   if (page_alloc[i].page_offset + frag_info->frag_stride <=
-   PAGE_SIZE)
-   continue;
-
-   if (unlikely(mlx4_alloc_pages(priv, _alloc[i],
- frag_info, gfp)))
-   goto out;
-   }
-
-   for (i = 0; i < priv->num_frags; i++) {
-   frags[i] = ring_alloc[i];
-   frags[i].page_offset += priv->rx_headroom;
-   rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
-   frags[i].page_offset);
-   ring_alloc[i] = page_alloc[i];
-   }
-
-   return 0;
-
-out:
-   while (i--) {
-   if (page_alloc[i].page != ring_alloc[i].page) {
-   dma_unmap_page(priv->ddev, page_alloc[i].dma,
-  PAGE_SIZE, priv->dma_dir);
-   page = page_alloc[i].page;
-   /* Revert changes done by mlx4_alloc_pages */
-   page_ref_sub(page, PAGE_SIZE /
-  priv->frag_info[i].frag_stride - 1);
-   put_page(page);
-   }
-   }
-   return -ENOMEM;
-}
-
-static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
- struct mlx4_en_rx_alloc *frags,
- int i)
-{
-   const struct mlx4_en_frag_info *frag_info = >frag_info[i];
-   u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
-
-
-   if (next_frag_end > PAGE_SIZE)
-   dma_unmap_page(priv->ddev, frags[i].dma, PAGE_SIZE,
-  priv->dma_dir);
-
-   if (frags[i].page)
-   put_page(frags[i].page);
-}
-
-static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
- struct mlx4_en_rx_ring *ring)
-{
-   int i;
-   struct mlx4_en_rx_alloc *page_alloc;
-
-   for (i = 0; i < priv->num_frags; i++) {
-   const struct mlx4_en_frag_info *frag_info = >frag_info[i];
-
-   if (mlx4_alloc_pages(priv, >page_alloc[i],
-frag_info, GFP_KERNEL | __GFP_COLD))
-   goto out;
-
-   en_dbg(DRV, priv, "  frag %d allocator: - frags:%d\n",
-  i, 

[PATCH net-next 8/9] mlx4: use order-0 pages for RX

2017-02-06 Thread Eric Dumazet
Use of order-3 pages is problematic in some cases.

This patch might add three kinds of regression :

1) a CPU performance regression, but we will add later page
recycling and performance should be back.

2) TCP receiver could grow its receive window slightly slower,
   because skb->len/skb->truesize ratio will decrease.
   This is mostly ok, we prefer being conservative to not risk OOM,
   and eventually tune TCP better in the future.
   This is consistent with other drivers using 2048 per ethernet frame.

3) Because we allocate one page per RX slot, we consume more
   memory for the ring buffers. XDP already had this constraint anyway.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 72 +---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  4 --
 2 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
dd3bfcfea10c4545dfeda0f999449b13ca91..be4f3491a4fcb6ee0e9fe4e71abfd2bc5373 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -53,38 +53,26 @@
 static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc *page_alloc,
const struct mlx4_en_frag_info *frag_info,
-   gfp_t _gfp)
+   gfp_t gfp)
 {
-   int order;
struct page *page;
dma_addr_t dma;
 
-   for (order = priv->rx_page_order; ;) {
-   gfp_t gfp = _gfp;
-
-   if (order)
-   gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NOMEMALLOC;
-   page = alloc_pages(gfp, order);
-   if (likely(page))
-   break;
-   if (--order < 0 ||
-   ((PAGE_SIZE << order) < frag_info->frag_size))
-   return -ENOMEM;
-   }
-   dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
-  priv->dma_dir);
+   page = alloc_page(gfp);
+   if (unlikely(!page))
+   return -ENOMEM;
+   dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir);
if (unlikely(dma_mapping_error(priv->ddev, dma))) {
put_page(page);
return -ENOMEM;
}
-   page_alloc->page_size = PAGE_SIZE << order;
page_alloc->page = page;
page_alloc->dma = dma;
page_alloc->page_offset = 0;
/* Not doing get_page() for each frag is a big win
 * on asymetric workloads. Note we can not use atomic_set().
 */
-   page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
+   page_ref_add(page, PAGE_SIZE / frag_info->frag_stride - 1);
return 0;
 }
 
@@ -105,7 +93,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
page_alloc[i].page_offset += frag_info->frag_stride;
 
if (page_alloc[i].page_offset + frag_info->frag_stride <=
-   ring_alloc[i].page_size)
+   PAGE_SIZE)
continue;
 
if (unlikely(mlx4_alloc_pages(priv, _alloc[i],
@@ -127,11 +115,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
while (i--) {
if (page_alloc[i].page != ring_alloc[i].page) {
dma_unmap_page(priv->ddev, page_alloc[i].dma,
-   page_alloc[i].page_size,
-   priv->dma_dir);
+  PAGE_SIZE, priv->dma_dir);
page = page_alloc[i].page;
/* Revert changes done by mlx4_alloc_pages */
-   page_ref_sub(page, page_alloc[i].page_size /
+   page_ref_sub(page, PAGE_SIZE /
   priv->frag_info[i].frag_stride - 1);
put_page(page);
}
@@ -147,8 +134,8 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
 
 
-   if (next_frag_end > frags[i].page_size)
-   dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
+   if (next_frag_end > PAGE_SIZE)
+   dma_unmap_page(priv->ddev, frags[i].dma, PAGE_SIZE,
   priv->dma_dir);
 
if (frags[i].page)
@@ -168,9 +155,8 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 frag_info, GFP_KERNEL | __GFP_COLD))
goto out;
 
-   en_dbg(DRV, priv, "  frag %d allocator: - size:%d frags:%d\n",
-  i, ring->page_alloc[i].page_size,
-  page_ref_count(ring->page_alloc[i].page));
+   en_dbg(DRV, priv, "  frag %d allocator: - frags:%d\n",
+  

[PATCH net-next 6/9] mlx4: reduce rx ring page_cache size

2017-02-06 Thread Eric Dumazet
We only need to store the page and dma address.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 17 ++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  2 --
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  6 +-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
80bb3c15f7c169f7091eb4a8dc06804f98b6..6c95694f6390aa9fbc5f941a97e305815949 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -250,7 +250,10 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv 
*priv,
(index << priv->log_rx_info);
 
if (ring->page_cache.index > 0) {
-   frags[0] = ring->page_cache.buf[--ring->page_cache.index];
+   ring->page_cache.index--;
+   frags[0].page = 
ring->page_cache.buf[ring->page_cache.index].page;
+   frags[0].dma  = 
ring->page_cache.buf[ring->page_cache.index].dma;
+   frags[0].page_offset = XDP_PACKET_HEADROOM;
rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
frags[0].page_offset);
return 0;
@@ -534,7 +537,9 @@ bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
if (cache->index >= MLX4_EN_CACHE_SIZE)
return false;
 
-   cache->buf[cache->index++] = *frame;
+   cache->buf[cache->index].page = frame->page;
+   cache->buf[cache->index].dma = frame->dma;
+   cache->index++;
return true;
 }
 
@@ -564,11 +569,9 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
int i;
 
for (i = 0; i < ring->page_cache.index; i++) {
-   struct mlx4_en_rx_alloc *frame = >page_cache.buf[i];
-
-   dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
-  priv->dma_dir);
-   put_page(frame->page);
+   dma_unmap_page(priv->ddev, ring->page_cache.buf[i].dma,
+  PAGE_SIZE, priv->dma_dir);
+   put_page(ring->page_cache.buf[i].page);
}
ring->page_cache.index = 0;
mlx4_en_free_rx_buf(priv, ring);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 
98bc67a7249b14f8857fe1fd6baa40ae3ec5..e0c5ffb3e3a6607456e1f191b0b8c8becfc7 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -354,8 +354,6 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc frame = {
.page = tx_info->page,
.dma = tx_info->map0_dma,
-   .page_offset = XDP_PACKET_HEADROOM,
-   .page_size = PAGE_SIZE,
};
 
if (!mlx4_en_rx_recycle(ring->recycle_ring, )) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 
5d65a60e93b7a2ae84312cd0f2d474a065d9..c9916b75b94bc9364b2cbe6da06a5ea385c6 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -267,9 +267,13 @@ struct mlx4_en_rx_alloc {
 };
 
 #define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
+
 struct mlx4_en_page_cache {
u32 index;
-   struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
+   struct {
+   struct page *page;
+   dma_addr_t  dma;
+   } buf[MLX4_EN_CACHE_SIZE];
 };
 
 struct mlx4_en_priv;
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next 5/9] mlx4: rx_headroom is a per port attribute

2017-02-06 Thread Eric Dumazet
No need to duplicate it per RX queue / frags.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 6 +++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
c6c64ac1e25931fc172beb5c718ec3a799f6..80bb3c15f7c169f7091eb4a8dc06804f98b6 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -115,7 +115,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
 
for (i = 0; i < priv->num_frags; i++) {
frags[i] = ring_alloc[i];
-   frags[i].page_offset += priv->frag_info[i].rx_headroom;
+   frags[i].page_offset += priv->rx_headroom;
rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
frags[i].page_offset);
ring_alloc[i] = page_alloc[i];
@@ -1199,7 +1199,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 */
priv->frag_info[0].frag_stride = PAGE_SIZE;
priv->dma_dir = PCI_DMA_BIDIRECTIONAL;
-   priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
+   priv->rx_headroom = XDP_PACKET_HEADROOM;
i = 1;
} else {
int buf_size = 0;
@@ -1211,12 +1211,12 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
priv->frag_info[i].frag_stride =
ALIGN(priv->frag_info[i].frag_size,
  SMP_CACHE_BYTES);
-   priv->frag_info[i].rx_headroom = 0;
buf_size += priv->frag_info[i].frag_size;
i++;
}
priv->rx_page_order = MLX4_EN_ALLOC_PREFER_ORDER;
priv->dma_dir = PCI_DMA_FROMDEVICE;
+   priv->rx_headroom = 0;
}
 
priv->num_frags = i;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 
fe8ed4e85e9645679cc37d0d30284b523689..5d65a60e93b7a2ae84312cd0f2d474a065d9 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -472,7 +472,6 @@ struct mlx4_en_mc_list {
 
 struct mlx4_en_frag_info {
u16 frag_size;
-   u16 rx_headroom;
u32 frag_stride;
 };
 
@@ -585,6 +584,7 @@ struct mlx4_en_priv {
u8 log_rx_info;
u8 dma_dir;
u8 rx_page_order;
+   u16 rx_headroom;
 
struct mlx4_en_tx_ring **tx_ring[MLX4_EN_NUM_TX_TYPES];
struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next 3/9] mlx4: remove order field from mlx4_en_frag_info

2017-02-06 Thread Eric Dumazet
This is really a port attribute, no need to duplicate it per
RX queue and per frag.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 6 +++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
9bb22eb5bfcc3037e92d06cca75d514dd52e..f868cb330039f5730ab8f59eca451c3d5272 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -59,7 +59,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
struct page *page;
dma_addr_t dma;
 
-   for (order = frag_info->order; ;) {
+   for (order = priv->rx_page_order; ;) {
gfp_t gfp = _gfp;
 
if (order)
@@ -1192,7 +1192,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 * This only works when num_frags == 1.
 */
if (priv->tx_ring_num[TX_XDP]) {
-   priv->frag_info[0].order = 0;
+   priv->rx_page_order = 0;
priv->frag_info[0].frag_size = eff_mtu;
priv->frag_info[0].frag_prefix_size = 0;
/* This will gain efficient xdp frame recycling at the
@@ -1206,7 +1206,6 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
int buf_size = 0;
 
while (buf_size < eff_mtu) {
-   priv->frag_info[i].order = MLX4_EN_ALLOC_PREFER_ORDER;
priv->frag_info[i].frag_size =
(eff_mtu > buf_size + frag_sizes[i]) ?
frag_sizes[i] : eff_mtu - buf_size;
@@ -1218,6 +1217,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
buf_size += priv->frag_info[i].frag_size;
i++;
}
+   priv->rx_page_order = MLX4_EN_ALLOC_PREFER_ORDER;
priv->dma_dir = PCI_DMA_FROMDEVICE;
}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 
549f88b9becd9f2dd96282a44f6d374f14a4..11898550f87c077f6687903790d329e4aa1e 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -474,7 +474,6 @@ struct mlx4_en_frag_info {
u16 frag_size;
u16 frag_prefix_size;
u32 frag_stride;
-   u16 order;
u16 rx_headroom;
 };
 
@@ -586,6 +585,7 @@ struct mlx4_en_priv {
u8 num_frags;
u8 log_rx_info;
u8 dma_dir;
+   u8 rx_page_order;
 
struct mlx4_en_tx_ring **tx_ring[MLX4_EN_NUM_TX_TYPES];
struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next 2/9] mlx4: dma_dir is a mlx4_en_priv attribute

2017-02-06 Thread Eric Dumazet
No need to duplicate it for all queues and frags.

num_frags & log_rx_info become u8 to save space.
u8 accesses are a bit faster than u16 anyway.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 16 
 drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  6 +++---
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
03f1713c94c7fa57e9eaaf87fe38a0a6d372..9bb22eb5bfcc3037e92d06cca75d514dd52e 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -72,7 +72,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
return -ENOMEM;
}
dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
-  frag_info->dma_dir);
+  priv->dma_dir);
if (unlikely(dma_mapping_error(priv->ddev, dma))) {
put_page(page);
return -ENOMEM;
@@ -128,7 +128,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
if (page_alloc[i].page != ring_alloc[i].page) {
dma_unmap_page(priv->ddev, page_alloc[i].dma,
page_alloc[i].page_size,
-   priv->frag_info[i].dma_dir);
+   priv->dma_dir);
page = page_alloc[i].page;
/* Revert changes done by mlx4_alloc_pages */
page_ref_sub(page, page_alloc[i].page_size /
@@ -149,7 +149,7 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 
if (next_frag_end > frags[i].page_size)
dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
-  frag_info->dma_dir);
+  priv->dma_dir);
 
if (frags[i].page)
put_page(frags[i].page);
@@ -181,7 +181,7 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
page_alloc = >page_alloc[i];
dma_unmap_page(priv->ddev, page_alloc->dma,
   page_alloc->page_size,
-  priv->frag_info[i].dma_dir);
+  priv->dma_dir);
page = page_alloc->page;
/* Revert changes done by mlx4_alloc_pages */
page_ref_sub(page, page_alloc->page_size /
@@ -206,7 +206,7 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv 
*priv,
   i, page_count(page_alloc->page));
 
dma_unmap_page(priv->ddev, page_alloc->dma,
-   page_alloc->page_size, frag_info->dma_dir);
+   page_alloc->page_size, priv->dma_dir);
while (page_alloc->page_offset + frag_info->frag_stride <
   page_alloc->page_size) {
put_page(page_alloc->page);
@@ -567,7 +567,7 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc *frame = >page_cache.buf[i];
 
dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
-  priv->frag_info[0].dma_dir);
+  priv->dma_dir);
put_page(frame->page);
}
ring->page_cache.index = 0;
@@ -1199,7 +1199,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 * expense of more costly truesize accounting
 */
priv->frag_info[0].frag_stride = PAGE_SIZE;
-   priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
+   priv->dma_dir = PCI_DMA_BIDIRECTIONAL;
priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
i = 1;
} else {
@@ -1214,11 +1214,11 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
priv->frag_info[i].frag_stride =
ALIGN(priv->frag_info[i].frag_size,
  SMP_CACHE_BYTES);
-   priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
priv->frag_info[i].rx_headroom = 0;
buf_size += priv->frag_info[i].frag_size;
i++;
}
+   priv->dma_dir = PCI_DMA_FROMDEVICE;
}
 
priv->num_frags = i;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 
3ed42199d3f1275f77560e92a430c0dde181..98bc67a7249b14f8857fe1fd6baa40ae3ec5 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -360,7 +360,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
 
if (!mlx4_en_rx_recycle(ring->recycle_ring, )) {
   

[PATCH net-next 4/9] mlx4: get rid of frag_prefix_size

2017-02-06 Thread Eric Dumazet
Using per frag storage for frag_prefix_size is really silly.

mlx4_en_complete_rx_desc() has all needed info already.

Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 27 ---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  3 +--
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 
f868cb330039f5730ab8f59eca451c3d5272..c6c64ac1e25931fc172beb5c718ec3a799f6 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -585,15 +585,14 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv 
*priv,
int length)
 {
struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
-   struct mlx4_en_frag_info *frag_info;
-   int nr;
+   struct mlx4_en_frag_info *frag_info = priv->frag_info;
+   int nr, frag_size;
dma_addr_t dma;
 
/* Collect used fragments while replacing them in the HW descriptors */
-   for (nr = 0; nr < priv->num_frags; nr++) {
-   frag_info = >frag_info[nr];
-   if (length <= frag_info->frag_prefix_size)
-   break;
+   for (nr = 0;;) {
+   frag_size = min_t(int, length, frag_info->frag_size);
+
if (unlikely(!frags[nr].page))
goto fail;
 
@@ -603,15 +602,16 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv 
*priv,
 
__skb_fill_page_desc(skb, nr, frags[nr].page,
 frags[nr].page_offset,
-frag_info->frag_size);
+frag_size);
 
skb->truesize += frag_info->frag_stride;
frags[nr].page = NULL;
+   nr++;
+   length -= frag_size;
+   if (!length)
+   break;
+   frag_info++;
}
-   /* Adjust size of last fragment to match actual length */
-   if (nr > 0)
-   skb_frag_size_set(_frags_rx[nr - 1],
-   length - priv->frag_info[nr - 1].frag_prefix_size);
return nr;
 
 fail:
@@ -1194,7 +1194,6 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
if (priv->tx_ring_num[TX_XDP]) {
priv->rx_page_order = 0;
priv->frag_info[0].frag_size = eff_mtu;
-   priv->frag_info[0].frag_prefix_size = 0;
/* This will gain efficient xdp frame recycling at the
 * expense of more costly truesize accounting
 */
@@ -1209,7 +1208,6 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
priv->frag_info[i].frag_size =
(eff_mtu > buf_size + frag_sizes[i]) ?
frag_sizes[i] : eff_mtu - buf_size;
-   priv->frag_info[i].frag_prefix_size = buf_size;
priv->frag_info[i].frag_stride =
ALIGN(priv->frag_info[i].frag_size,
  SMP_CACHE_BYTES);
@@ -1229,10 +1227,9 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
   eff_mtu, priv->num_frags);
for (i = 0; i < priv->num_frags; i++) {
en_err(priv,
-  "  frag:%d - size:%d prefix:%d stride:%d\n",
+  "  frag:%d - size:%d stride:%d\n",
   i,
   priv->frag_info[i].frag_size,
-  priv->frag_info[i].frag_prefix_size,
   priv->frag_info[i].frag_stride);
}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 
11898550f87c077f6687903790d329e4aa1e..fe8ed4e85e9645679cc37d0d30284b523689 
100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -472,9 +472,8 @@ struct mlx4_en_mc_list {
 
 struct mlx4_en_frag_info {
u16 frag_size;
-   u16 frag_prefix_size;
-   u32 frag_stride;
u16 rx_headroom;
+   u32 frag_stride;
 };
 
 #ifdef CONFIG_MLX4_EN_DCB
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next 0/9] mlx4: order-0 allocations and page recycling

2017-02-06 Thread Eric Dumazet
As mentioned half a year ago, we better switch mlx4 driver to order-0
allocations and page recycling.

This reduces vulnerability surface thanks to better skb->truesize tracking
and provides better performance in most cases.

Worth noting this patch series deletes more than 100 lines of code ;)

Eric Dumazet (9):
  mlx4: use __skb_fill_page_desc()
  mlx4: dma_dir is a mlx4_en_priv attribute
  mlx4: remove order field from mlx4_en_frag_info
  mlx4: get rid of frag_prefix_size
  mlx4: rx_headroom is a per port attribute
  mlx4: reduce rx ring page_cache size
  mlx4: removal of frag_sizes[]
  mlx4: use order-0 pages for RX
  mlx4: add page recycling in receive path

 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 350 +--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c   |   4 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  28 +--
 3 files changed, 129 insertions(+), 253 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



Re: [PATCH v2 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Andy Lutomirski
On Mon, Feb 6, 2017 at 5:42 PM, Alexei Starovoitov
 wrote:
> On Sat, Feb 04, 2017 at 08:17:57PM -0800, Andy Lutomirski wrote:
>> On Sat, Feb 4, 2017 at 8:05 PM, Alexei Starovoitov
>>  wrote:
>> > On Sat, Feb 04, 2017 at 07:33:14PM -0800, Andy Lutomirski wrote:
>> >> What does "bpf programs are global" mean?  I am genuinely unable to
>> >> figure out what you mean.  Here are some example guesses of what you
>> >> might mean:
>> >>
>> >>  - BPF programs are compiled independently of a namespace.  This is
>> >> certainly true, but I don't think it matters.
>> >>
>> >>  - You want BPF programs to affect everything on the system.  But this
>> >> doesn't seem right to be -- they only affect things in the relevant
>> >> cgroup, so they're not global in that sense.
>> >
>> > All bpf program types are global in the sense that you can
>> > make all of them to operate across all possible scopes and namespaces.
>>
>> I still don't understand what you mean here.  A seccomp program runs
>> in the process that installs it and children -- it does not run in
>> "all possible scopes".
>
> seccomp and classic bpf is different, since there is no concept
> of the program there. cbpf is a stateless set of instructions
> that belong to some entity like seccomp or socket. ebpf is stateful
> and starts with the program, then hook and then scope.

So... are you saying that a loaded eBPF object is global in the sense
that if you attach the same object to more than one thing (two
sockets, say), the *same* program runs and shares its state?  If so, I
agree, but that's still not an argument that the *same* attachment of
an eBPF program to a cgroup should run in multiple network namespaces.
You could also attach the (same) program once per netns and its state
would be shared.

I'm pretty sure I've never suggested that an eBPF program be bound to
a namespace.  I just think that a lot of things get more
straightforward if an *attachment* of an eBPF program to a cgroup is
bound to a single namespace.

>
>> A socket filter runs on a single socket and
>> therefore runs in a single netns.  So presumably I'm still
>> misunderstanding you
>
> in classic - yes. ebpf can have the same program attached to
> multiple sockets in different netns.
> For classic - the object is the socket and the user can only
> manipulate that socket. For extended - the object is the program
> and it can exist on its own. Like the program can be pinned in bpffs
> while it's not attached to any hook.
> For classic bpf the ideas of CRIU naturally apply, since
> it checkpoints the socket and it happens that socket has
> a set of statless cbpf instructions within. So it's
> expected to save/restore cbpf as part of socket save/restore.
> In case of ebpf the program exists independently of the socket.

True.

> Doing save/restore of the ebpf program attached to a socket
> is meaningless, since it could be pinned in bpffs, attached
> to other sockets, has state in bpf maps, some other process
> might be actively talking to that program and so on.
> ebpf is a graph of interconnected pieces. To criu such thing
> one really need to freeze the whole system, all of it processes,
> and stop the kernel. I don't see criu ever be applied to ebpf.

Not true, I think.  CRIU could figure out which eBPF program is
attached to a socket and snapshot the attachment and (separately) the
eBPF object.  If the eBPF object were to be attached to two sockets,
CRIU would notice and only snaptshot the eBPF program once.  I don't
see why the whole system would need to be snapshotted.

Obviously this code isn't written yet.  But if eBPF were to be widely
used for, say, seccomp, I bet the CRIU code would show up in short
order.

>
> Potentially we can also allow a combination of scopes where
> single bpf program is attached to a hook while scoped by
> both netns and cgroup at the same time.
> I see it as an extension to prog_attach command where
> user will specify two fds (one for cgroup and one for netns) to make
> given prog_fd program scoped by both.
> Nothing in the current api prevents such future extensions.

I think this might be sufficient for all usecases.  Facebook's use
case of monitoring everything (if I understood correctly) could be
addressed by just attaching the program to the cgroup once for each
namespace.  The benefit would be a good deal of simplicity: you could
relax the capable() call to ns_capable() in the future, you wouldn't
need this patch, and you wouldn't be creating the first major
non-netns-specific network hook in the kernel.

>
> And from the other thread:
>
>> I'm not saying that at all.  I'm saying that this use case sounds
>> valid, but maybe it could be solved differently.  Here are some ideas:
>
> I'm happy to discuss api extension ideas as long as I don't hear
> a week later that they should be done for 4.10
>
>> - Expose the actual cgroup (relative to the hooked cgroup) to the BPF
>> program.  Then you 

Re: [PATCH 1/2] libceph: Remove unneeded stddef.h include

2017-02-06 Thread Stafford Horne
On Mon, Feb 06, 2017 at 04:16:55PM +, David Laight wrote:
> From: Stafford Horne
> > Sent: 05 February 2017 07:08
> > This was causing a build failure for openrisc when using musl and
> > gcc 5.4.0 since the file is not available in the toolchain.
> > 
> > It doesn't seem this is needed and removing it does not cause any build
> > warnings for me.
> 
> Hmmm... stddef.h is part of the SuS v2.
> Required to get definitions for NULL, offsetof(), ptrdiff_t and size_t.
> 
> So any system that is pretending to be unix-like ought to have one.

Hello,
I agree, and also I can see that musl and gcc both do provide it.  I am
not sure why it's not getting included for kernel builds, I didn't look
into it much because on the other hand...

Linux also provides it in 'linux/stddef.h'.  In this case vchiq_shim.c
includes "vchiq_util.h".
  which includes "linux/string.h"
which includes "linux/stddef.h"

So the requirement seems satisfied.

I am fine to send a patch to change the include to <linux/stddef.h> as I
mentioned in the cover letter.  But I don't think it's really needed.

-Stafford


[PATCH v2 0/2] Fixes for sierra_net driver

2017-02-06 Thread Stefan Brüns
When trying to initiate a dual-stack (ipv4v6) connection, a MC7710, FW
version SWI9200X_03.05.24.00ap answers with an unsupported LSI. Add support
for this LSI.
Also the link_type should be ignored when going idle, otherwise the modem
is stuck in a bad link state.
Tested on MC7710, T-Mobile DE, APN internet.telekom, IPv4v6 PDP type. Both
IPv4 and IPv6 connections work.

v2: Do not overwrite protocol field in rx_fixup

Stefan Brüns (2):
  sierra_net: Add support for IPv6 and Dual-Stack Link Sense Indications
  sierra_net: Skip validating irrelevant fields for IDLE LSIs

 drivers/net/usb/sierra_net.c | 110 ---
 1 file changed, 71 insertions(+), 39 deletions(-)

-- 
2.11.0



[PATCH v2 2/2] sierra_net: Skip validating irrelevant fields for IDLE LSIs

2017-02-06 Thread Stefan Brüns
When the context is deactivated, the link_type is set to 0xff, which
triggers a warning message, and results in a wrong link status, as
the LSI is ignored.

Signed-off-by: Stefan Brüns 
---
 drivers/net/usb/sierra_net.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c
index 977afe74ed54..e9e14263ff0a 100644
--- a/drivers/net/usb/sierra_net.c
+++ b/drivers/net/usb/sierra_net.c
@@ -385,6 +385,13 @@ static int sierra_net_parse_lsi(struct usbnet *dev, char 
*data, int datalen)
return -1;
}
 
+   /* Validate the session state */
+   if (lsi->session_state == SIERRA_NET_SESSION_IDLE) {
+   netdev_err(dev->net, "Session idle, 0x%02x\n",
+  lsi->session_state);
+   return 0;
+   }
+
/* Validate the protocol  - only support UMTS for now */
if (lsi->protocol == SIERRA_NET_PROTOCOL_UMTS) {
struct lsi_umts_single *single = (struct lsi_umts_single *)lsi;
@@ -418,13 +425,6 @@ static int sierra_net_parse_lsi(struct usbnet *dev, char 
*data, int datalen)
return 0;
}
 
-   /* Validate the session state */
-   if (lsi->session_state == SIERRA_NET_SESSION_IDLE) {
-   netdev_err(dev->net, "Session idle, 0x%02x\n",
-   lsi->session_state);
-   return 0;
-   }
-
/* Set link_sense true */
return 1;
 }
-- 
2.11.0



[PATCH v2 1/2] sierra_net: Add support for IPv6 and Dual-Stack Link Sense Indications

2017-02-06 Thread Stefan Brüns
If a context is configured as dualstack ("IPv4v6"), the modem indicates
the context activation with a slightly different indication message.
The dual-stack indication omits the link_type (IPv4/v6) and adds
additional address fields.
IPv6 LSIs are identical to IPv4 LSIs, but have a different link type.

Signed-off-by: Stefan Brüns 
---
v2: Do not overwrite protocol field in rx_fixup

Example LSI LINK UP indication:

   00 ed 78 00 04 01 00 e9 0a 14 00 54 00 65 00 6c  ..xT.e.l
0010   00 65 00 6b 00 6f 00 6d 00 2e 00 64 00 65 48 03  .e.k.o.m...d.eH.
0020   c8 be d1 00 62 00 00 00 2c 80 f0 01 00 00 00 00  b...,...
0030   30 cb 04 4c 49 4e 4b 20 55 50 00 00 00 00 00 00  0..LINK UP..
0040   00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
0050   00 00 00 00 04 0a 23 38 db 10 2a 01 05 98 88 c0  ..#8..*.
0060   1f da 00 01 00 01 91 23 a8 f9 00 00 00 00 00 00  ...#
0070   00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
0080   00 04 0a 4a d2 d2 10 2a 01 05 98 07 ff 00 00 00  ...J...*
0090   10 00 74 02 10 02 10 04 0a 4a d2 d3 10 2a 01 05  ..t..J...*..
00a0   98 07 ff 00 00 00 10 00 74 02 10 02 11 00 00 00  t...
00b0   00 00 00 00 00 00 00 00 00 00 00 00 00 ff ff 00  
00c0   00 00 00 00 00 c3 50 04 00 00 00 00 10 fe 80 00  ..P.
00d0   00 00 00 00 00 00 00 00 00 00 00 00 05 00 00 00  
00e0   00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00f0   00
---
 drivers/net/usb/sierra_net.c | 100 ---
 1 file changed, 66 insertions(+), 34 deletions(-)

diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c
index a251588762ec..977afe74ed54 100644
--- a/drivers/net/usb/sierra_net.c
+++ b/drivers/net/usb/sierra_net.c
@@ -73,8 +73,6 @@ staticatomic_t iface_counter = ATOMIC_INIT(0);
 /* Private data structure */
 struct sierra_net_data {
 
-   u8 ethr_hdr_tmpl[ETH_HLEN]; /* ethernet header template for rx'd pkts */
-
u16 link_up;/* air link up or down */
u8 tx_hdr_template[4];  /* part of HIP hdr for tx'd packets */
 
@@ -122,6 +120,7 @@ struct param {
 
 /* LSI Protocol types */
 #define SIERRA_NET_PROTOCOL_UMTS  0x01
+#define SIERRA_NET_PROTOCOL_UMTS_DS   0x04
 /* LSI Coverage */
 #define SIERRA_NET_COVERAGE_NONE  0x00
 #define SIERRA_NET_COVERAGE_NOPACKET  0x01
@@ -129,7 +128,8 @@ struct param {
 /* LSI Session */
 #define SIERRA_NET_SESSION_IDLE   0x00
 /* LSI Link types */
-#define SIERRA_NET_AS_LINK_TYPE_IPv4  0x00
+#define SIERRA_NET_AS_LINK_TYPE_IPV4  0x00
+#define SIERRA_NET_AS_LINK_TYPE_IPV6  0x02
 
 struct lsi_umts {
u8 protocol;
@@ -137,9 +137,14 @@ struct lsi_umts {
__be16 length;
/* eventually use a union for the rest - assume umts for now */
u8 coverage;
-   u8 unused2[41];
+   u8 network_len; /* network name len */
+   u8 network[40]; /* network name (UCS2, bigendian) */
u8 session_state;
u8 unused3[33];
+} __packed;
+
+struct lsi_umts_single {
+   struct lsi_umts lsi;
u8 link_type;
u8 pdp_addr_len; /* NW-supplied PDP address len */
u8 pdp_addr[16]; /* NW-supplied PDP address (bigendian)) */
@@ -158,10 +163,31 @@ struct lsi_umts {
u8 reserved[8];
 } __packed;
 
+struct lsi_umts_dual {
+   struct lsi_umts lsi;
+   u8 pdp_addr4_len; /* NW-supplied PDP IPv4 address len */
+   u8 pdp_addr4[4];  /* NW-supplied PDP IPv4 address (bigendian)) */
+   u8 pdp_addr6_len; /* NW-supplied PDP IPv6 address len */
+   u8 pdp_addr6[16]; /* NW-supplied PDP IPv6 address (bigendian)) */
+   u8 unused4[23];
+   u8 dns1_addr4_len; /* NW-supplied 1st DNS v4 address len (bigendian) */
+   u8 dns1_addr4[4];  /* NW-supplied 1st DNS v4 address */
+   u8 dns1_addr6_len; /* NW-supplied 1st DNS v6 address len */
+   u8 dns1_addr6[16]; /* NW-supplied 1st DNS v6 address (bigendian)*/
+   u8 dns2_addr4_len; /* NW-supplied 2nd DNS v4 address len (bigendian) */
+   u8 dns2_addr4[4];  /* NW-supplied 2nd DNS v4 address */
+   u8 dns2_addr6_len; /* NW-supplied 2nd DNS v6 address len */
+   u8 dns2_addr6[16]; /* NW-supplied 2nd DNS v6 address (bigendian)*/
+   u8 unused5[68];
+} __packed;
+
 #define SIERRA_NET_LSI_COMMON_LEN  4
-#define SIERRA_NET_LSI_UMTS_LEN(sizeof(struct lsi_umts))
+#define SIERRA_NET_LSI_UMTS_LEN(sizeof(struct lsi_umts_single))
 #define SIERRA_NET_LSI_UMTS_STATUS_LEN \
(SIERRA_NET_LSI_UMTS_LEN - SIERRA_NET_LSI_COMMON_LEN)
+#define SIERRA_NET_LSI_UMTS_DS_LEN (sizeof(struct lsi_umts_dual))
+#define SIERRA_NET_LSI_UMTS_DS_STATUS_LEN \
+   (SIERRA_NET_LSI_UMTS_DS_LEN - SIERRA_NET_LSI_COMMON_LEN)
 
 /* Forward definitions */
 static void sierra_sync_timer(unsigned long syncdata);
@@ -191,10 +217,11 @@ static inline void sierra_net_set_private(struct 

Re: [net-next PATCH v2 5/5] virtio_net: XDP support for adjust_head

2017-02-06 Thread Jason Wang



On 2017年02月07日 03:29, John Fastabend wrote:

On 17-02-05 11:08 PM, Jason Wang wrote:


On 2017年02月03日 11:16, John Fastabend wrote:

Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when a XDP
program is loaded.

In order to ensure that we do not have to unwind queue headroom push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.

At the moment this code must do a full reset to ensure old buffers
without headroom on program add or with headroom on program removal
are not used incorrectly in the datapath. Ideally we would only
have to disable/enable the RX queues being updated but there is no
API to do this at the moment in virtio so use the big hammer. In
practice it is likely not that big of a problem as this will only
happen when XDP is enabled/disabled changing programs does not
require the reset. There is some risk that the driver may either
have an allocation failure or for some reason fail to correctly
negotiate with the underlying backend in this case the driver will
be left uninitialized. I have not seen this ever happen on my test
systems and for what its worth this same failure case can occur
from probe and other contexts in virtio framework.

Signed-off-by: John Fastabend 
---


[...]


@@ -412,7 +418,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
   struct bpf_prog *xdp_prog;
 len -= vi->hdr_len;
-skb_trim(skb, len);
 rcu_read_lock();
   xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -424,12 +429,16 @@ static struct sk_buff *receive_small(struct net_device
*dev,
   if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
   goto err_xdp;
   -xdp.data = skb->data;
+xdp.data_hard_start = skb->data;
+xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
   xdp.data_end = xdp.data + len;
   act = bpf_prog_run_xdp(xdp_prog, &xdp);
 switch (act) {
   case XDP_PASS:
+/* Recalculate length in case bpf program changed it */
+__skb_pull(skb, xdp.data - xdp.data_hard_start);

But skb->len were trimmed to len below which seems wrong.

I believe this is correct and it passes my basic iperf/ping tests.

When we are using small buffers with XDP, skb->data is pointing to the front
of the buffer. This space includes the XDP headroom. When we pass the skb up
to the stack we need to pull this off and point to the start of the data. But
there still is likely a bunch of room at the end of the buffer assuming the
packet is smaller than the buffer side.


+len = xdp.data_end - xdp.data;
   break;
   case XDP_TX:
   if (unlikely(!virtnet_xdp_xmit(vi, rq, , skb)))
@@ -446,6 +455,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
   }
   rcu_read_unlock();
   +skb_trim(skb, len);

So here we trim the packet to set the length to the actual payload size. The
'len' parameter passed into receive_small does not include the headroom so this
gives us the correct length of the payload.

Make sense?


Yes, you are right.




   return skb;
 err_xdp:

[...]


@@ -569,7 +580,7 @@ static struct sk_buff *receive_mergeable(struct net_device
*dev,
 page, offset, );
   if (!xdp_page)
   goto err_xdp;
-offset = 0;
+offset = VIRTIO_XDP_HEADROOM;
   } else {
   xdp_page = page;
   }
@@ -582,19 +593,30 @@ static struct sk_buff *receive_mergeable(struct
net_device *dev,
   if (unlikely(hdr->hdr.gso_type))
   goto err_xdp;
   +/* Allow consuming headroom but reserve enough space to push
+ * the descriptor on if we get an XDP_TX return code.
+ */
   data = page_address(xdp_page) + offset;
+xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;

Should be data - VIRTIO_XDP_HEADROOM I think?


If the XDP program does an adjust_head() and then a XDP_TX I want to ensure
we reserve enough headroom to push the header onto the buffer when the packet
is sent. So the additional hdr_len reserve here is intentional. Otherwise we
would need to detect this and do some type of linearize action.


I get the point.




   xdp.data = data + vi->hdr_len;
   xdp.data_end = xdp.data + (len - vi->hdr_len);
   act = bpf_prog_run_xdp(xdp_prog, &xdp);
 switch (act) {
   case XDP_PASS:
+/* recalculate offset to account for any header
+ * adjustments. Note other cases do not build an
+ * skb and avoid using offset
+ */
+offset = xdp.data -
+page_address(xdp_page) - vi->hdr_len;
+
   /* We can only create skb based on xdp_page. */
   if (unlikely(xdp_page != page)) {
   

[PATCH v4 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Alexei Starovoitov
in cases where bpf programs are looking at sockets and packets
that belong to different netns, it could be useful to get an id
that uniquely identify a netns within the whole system.

Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns
unique value that identifies netns of given socket or dev_net(skb->dev)
The upper 32-bits of the return value contain device id where namespace
filesystem resides and lower 32-bits contain inode number within that 
filesystem.
It's the same as
 struct stat st;
 stat("/proc/pid/ns/net", &st);
 return (st.st_dev << 32) | st.st_ino;

For example to disallow raw sockets in all non-init netns
the bpf_type_cgroup_sock program can do:
if (sk->type == SOCK_RAW && bpf_sk_netns_id(sk) != 0x3f075)
  return 0;
where 0x3f075 comes from combination of st_dev and st_ino
of /proc/pid/ns/net

Note that all bpf programs types are global. The same socket filter
program can be attached to sockets in different netns,
just like cls_bpf can see ingress/egress packets of multiple
net_devices in different netns. The cgroup_bpf programs are
the most exposed to sockets and devices across netns,
but the need to identify netns applies to all.
For example, if bpf_type_cgroup_skb didn't exist the system wide
monitoring daemon could have used ld_preload mechanism and
attached the same program to see traffic from applications
across netns. Therefore make bpf_sk_netns_id() helper available
to all network related bpf program types.
For socket, cls_bpf and cgroup_skb programs this helper
can be considered a new feature, whereas for cgroup_sock
programs that modify sk->bound_dev_if (like 'ip vrf' does)
it's a bug fix, since 'ip vrf' needs to be netns aware.

Signed-off-by: Alexei Starovoitov 
Reviewed-by: David Ahern 
Tested-by: David Ahern 
---
Eric, I'v added proc_get_ns_devid_inum() to nsfs.c
right next to __ns_get_path(), so when it is time in the future
to make nsfs more namespace aware, it will be easy to adjust
both new_inode_pseudo(mnt->mnt_sb) line and proc_get_ns_devid_inum()
I thought about using ns->stashed, but it's obviously transient
inode and not usable. If later we decide to store dev_t into ns_common
it will be fine as well. We'll just change proc_get_ns_devid_inum()
without affecting user space.

Note that eBPF is 64-bit and sizeof(long)==8 in bpf program,
so no need to mess with old_encode_dev and old_valid_dev checks.
new_encode_dev() is enough.

v2->v3: build bot complained. s/static/static inline/. no other changes.
v3->v4: fixed fallthrough case. Thanks Daniel.
---
 fs/nsfs.c |  7 +++
 include/linux/proc_ns.h   |  3 ++-
 include/uapi/linux/bpf.h  | 14 +-
 net/core/filter.c | 45 ++-
 samples/bpf/bpf_helpers.h |  2 ++
 samples/bpf/sock_flags_kern.c |  2 ++
 samples/bpf/sockex1_kern.c|  2 ++
 7 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8c9fb29c6673..1a604bccef86 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -49,6 +49,13 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
 }
 
+u64 proc_get_ns_devid_inum(struct ns_common *ns)
+{
+   u64 dev = new_encode_dev(nsfs_mnt->mnt_sb->s_dev);
+
+   return (dev << 32) | ns->inum;
+}
+
 static void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
struct vfsmount *mnt = nsfs_mnt;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 12cb8bd81d2d..531c16105198 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -48,7 +48,7 @@ extern int pid_ns_prepare_proc(struct pid_namespace *ns);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
 extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
-
+extern u64 proc_get_ns_devid_inum(struct ns_common *ns);
 #else /* CONFIG_PROC_FS */
 
 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
@@ -61,6 +61,7 @@ static inline int proc_alloc_inum(unsigned int *inum)
 }
 static inline void proc_free_inum(unsigned int inum) {}
 
+static inline u64 proc_get_ns_devid_inum(struct ns_common *ns) { return 0; }
 #endif /* CONFIG_PROC_FS */
 
 static inline int ns_alloc_inum(struct ns_common *ns)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87dbe9f..e5b8cf16cbaf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -430,6 +430,17 @@ union bpf_attr {
  * @xdp_md: pointer to xdp_md
  * @delta: An positive/negative integer to be added to xdp_md.data
  * Return: 0 on success or negative on error
+ *
+ * u64 bpf_sk_netns_id(sk)
+ * Returns unique value that identifies netns of given socket or skb.
+ * The upper 32-bits of the return value contain device id where namespace
+ * filesystem resides and lower 32-bits contain inode number within
+ * that filesystem. It's the 

Re: [PATCH v2 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Alexei Starovoitov
On Sat, Feb 04, 2017 at 08:17:57PM -0800, Andy Lutomirski wrote:
> On Sat, Feb 4, 2017 at 8:05 PM, Alexei Starovoitov
>  wrote:
> > On Sat, Feb 04, 2017 at 07:33:14PM -0800, Andy Lutomirski wrote:
> >> On Sat, Feb 4, 2017 at 7:25 PM, Alexei Starovoitov
> >>  wrote:
> >> > On Sat, Feb 04, 2017 at 09:15:10AM -0800, Andy Lutomirski wrote:
> >> >> On Fri, Feb 3, 2017 at 5:22 PM, Alexei Starovoitov  wrote:
> >> >> > Note that all bpf programs types are global.
> >> >>
> >> >> I don't think this has a clear enough meaning to work with.  In
> >> >
> >> > Please clarify what you mean. The quoted part says
> >> > "bpf programs are global". What is not "clear enough" there?
> >>
> >> What does "bpf programs are global" mean?  I am genuinely unable to
> >> figure out what you mean.  Here are some example guesses of what you
> >> might mean:
> >>
> >>  - BPF programs are compiled independently of a namespace.  This is
> >> certainly true, but I don't think it matters.
> >>
> >>  - You want BPF programs to affect everything on the system.  But this
> >> doesn't seem right to be -- they only affect things in the relevant
> >> cgroup, so they're not global in that sense.
> >
> > All bpf program types are global in the sense that you can
> > make all of them to operate across all possible scopes and namespaces.
> 
> I still don't understand what you mean here.  A seccomp program runs
> in the process that installs it and children -- it does not run in
> "all possible scopes". 

seccomp and classic bpf is different, since there is no concept
of the program there. cbpf is a stateless set of instructions
that belong to some entity like seccomp or socket. ebpf is stateful
and starts with the program, then hook and then scope.

> A socket filter runs on a single socket and
> therefore runs in a single netns.  So presumably I'm still
> misunderstanding you

in classic - yes. ebpf can have the same program attached to
multiple sockets in different netns.
For classic - the object is the socket and the user can only
manipulate that socket. For extended - the object is the program
and it can exist on its own. Like the program can be pinned in bpffs
while it's not attached to any hook.
For classic bpf the ideas of CRIU naturally apply, since
it checkpoints the socket and it happens that socket has
a set of statless cbpf instructions within. So it's
expected to save/restore cbpf as part of socket save/restore.
In case of ebpf the program exists independently of the socket.
Doing save/restore of the ebpf program attached to a socket
is meaningless, since it could be pinned in bpffs, attached
to other sockets, has state in bpf maps, some other process
might be actively talking to that program and so on.
ebpf is a graph of interconnected pieces. To criu such thing
one really need to freeze the whole system, all of it processes,
and stop the kernel. I don't see criu ever be applied to ebpf.

> > cgroup only gives a scope for the program to run, but it's
> > not limited by it. The user can have the same program
> > attached to two or more different cgroups, so one program
> > will run across multiple cgroups.
> 
> Does this mean "BPF programs are compiled independently of a
> namespace?"  If so, I don't see why it's relevant at all.  Sure, you
> could compile a BPF program once and install it in two different
> scopes, but that doesn't mean that the kernel should *run* it globally
> in any sense.  Can you clarify?

if single program is attached to different sockets it exactly
means that the kernel should run it for different sockets :)

> >>  - The set of BPF program types and the verification rules are
> >> independent of cgroup and namespace.  This is true, but I don't think
> >> it matters.
> >
> > It matters. That's actually the key to understand. The loading part
> > verifies correctness for particular program type.
> > Afterwards the same program can be attached to any place.
> > Including different cgroups and different namespaces.
> > The 'attach' part is like 'switch on' that enables program
> > on particular hook. The scope (whether it's socket or netdev or cgroup)
> > is a scope that program author uses to narrow down the hook,
> > but it's not an ultimate restriction.
> > For example the socket program can be attached to sockets and
> > share information with cls_bpf program attached to netdev.
> > The kprobe tracing program can peek into kernel internal data
> > and share it with cls_bpf or any other type as long as
> > everything is root. The information flow is global to the whole system.
> 
> Why does any of this imply that a cgroup+bpf program that is attached
> once should run for all network namespaces?

because cgroup is the only scope that is being used to scope this
particular bpf_type_cgroup_sock program type.
In the future we may add bpf_type_netns_sock program type
which will use exactly the same sock create hook, but will be scoped
by 

Re: [PATCH net-next v3 00/12] bnxt_en: Add XDP support.

2017-02-06 Thread Jakub Kicinski
On Mon,  6 Feb 2017 16:55:31 -0500, Michael Chan wrote:
> The first 10 patches refactor the code (rx/tx code paths and ring logic)
> and add the basic infrastructure to support XDP.  The 11th patch adds
> basic ndo_xdp to support XDP_DROP and XDP_PASS only.  The 12th patch
> completes the series with XDP_TX.
> 
> Thanks to Andy Gospodarek for testing and uncovering some bugs.
> 
> v3: Removed Kconfig option.
> Pass modified offset and length to stack for XDP_PASS.

Looks correct now, thanks!


Re: [PATCH net] bpf: expose netns inode to bpf programs

2017-02-06 Thread Alexei Starovoitov
On Sat, Feb 04, 2017 at 09:05:29PM -0800, Andy Lutomirski wrote:
> 
> I'm not saying that at all.  I'm saying that this use case sounds
> valid, but maybe it could be solved differently.  Here are some ideas:

Great. Combining multiple threads. Replied in bpf_sk_netns_id thread.



Re: [PATCH net-next 1/7] openvswitch: Use inverted tuple in ovs_ct_find_existing() if NATted.

2017-02-06 Thread Joe Stringer
On 5 February 2017 at 14:28, David Miller  wrote:
> From: Jarno Rajahalme 
> Date: Thu,  2 Feb 2017 17:10:00 -0800
>
>> This does not match either of the conntrack tuples above.  Normally
>> this does not matter, as the conntrack lookup was already done using
>> the tuple (B,A), but if the current packet does not match any flow in
>> the OVS datapath, the packet is sent to userspace via an upcall,
>> during which the packet's skb is freed, and the conntrack entry
>> pointer in the skb is lost.
>
> This is the real bug.
>
> If the metadata for a packet is important, as it obviously is here,
> then upcalls should preserve that metadata across the upcall.  This
> is exactly how NF_QUEUE handles this problem and so should OVS.

Looks like the patch #5 provides this preservation across upcall, so
this patch can be converted to use the key->ct.orig_* from that patch
instead of doing the invert.


Re: [RFC] igmp: address pmc kmemleak from on igmpv3_del_delrec()

2017-02-06 Thread Cong Wang
On Fri, Feb 3, 2017 at 1:20 PM, Luis R. Rodriguez  wrote:
> When we igmpv3_add_delrec() we kzalloc the pmc, but when users
> calligmpv3_del_delrec() we never free the pmc. This was caught
> by the following kmemleak splat:
>
> unreferenced object 0x99666ff43b40 (size 192):
>   comm "systemd-resolve", pid 1258, jiffies 4309905600 (age 2138.352s)
>   hex dump (first 32 bytes):
> 00 6a 64 72 66 99 ff ff e0 00 00 fc 00 00 00 00  .jdrf...
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
>   backtrace:
> [] kmemleak_alloc+0x4a/0xa0
> [] kmem_cache_alloc_trace+0x107/0x240
> [] igmp_group_dropped+0xfd/0x270
> [] ip_mc_dec_group+0xaf/0x110
> [] ip_mc_leave_group+0xb6/0x140
> [] do_ip_setsockopt.isra.13+0x4c7/0xed0
> [] ip_setsockopt+0x34/0xb0
> [] udp_setsockopt+0x1b/0x30
> [] sock_common_setsockopt+0x14/0x20
> [] SyS_setsockopt+0x80/0xe0
> [] do_syscall_64+0x5b/0xc0
> [] return_from_SYSCALL_64+0x0/0x6a
> [] 0x
>
> Signed-off-by: Luis R. Rodriguez 
> ---
>
> I can reproduce this over time on a qemu box running next-20170125.
> After running this for a while I no longer see the splat. This needs
> confirmation form folks more familiar with the code, hence RFC. If
> this is a real fix we need appropriate tags for the patch.


Looks good to me. Adding some people who recent touched it to CC.

>
>  net/ipv4/igmp.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index 5b15459955f8..44fd86de2823 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -1172,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, 
> struct ip_mc_list *im)
> psf->sf_crcount = im->crcount;
> }
> in_dev_put(pmc->interface);
> +   kfree(pmc);
> }
> spin_unlock_bh(&im->lock);
>  }
> --
> 2.11.0
>


Re: [PATCH net-next] net: phy: Allow splitting MDIO bus/device support from PHYs

2017-02-06 Thread Florian Fainelli
On 02/06/2017 04:21 PM, Russell King - ARM Linux wrote:
> On Mon, Feb 06, 2017 at 04:11:12PM -0800, Florian Fainelli wrote:
>> Introduce a new configuration symbol: MDIO_DEVICE which allows building
>> the MDIO devices and bus code, without pulling in the entire Ethernet
>> PHY library and devices code.
>>
>> PHYLIB nows select MDIO_DEVICE and the relevant Makefile files are
>> updated to reflect that.
> 
> I think you've been a little too quick off the mark posting this -
> see my points raised in reply to your RFC.  This will cause problems
> when built as a module, not least the missing MODULE_LICENSE()s.
> 

Yes, good point, let me fix that.
-- 
Florian


Re: [PATCH net-next] net: phy: Allow splitting MDIO bus/device support from PHYs

2017-02-06 Thread Russell King - ARM Linux
On Mon, Feb 06, 2017 at 04:11:12PM -0800, Florian Fainelli wrote:
> Introduce a new configuration symbol: MDIO_DEVICE which allows building
> the MDIO devices and bus code, without pulling in the entire Ethernet
> PHY library and devices code.
> 
> PHYLIB nows select MDIO_DEVICE and the relevant Makefile files are
> updated to reflect that.

I think you've been a little too quick off the mark posting this -
see my points raised in reply to your RFC.  This will cause problems
when built as a module, not least the missing MODULE_LICENSE()s.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.


Re: [RFC net-next] net: phy: Allow splitting MDIO bus/device support from PHYs

2017-02-06 Thread Russell King - ARM Linux
On Mon, Feb 06, 2017 at 04:01:27PM -0800, Florian Fainelli wrote:
> On 01/30/2017 07:29 PM, Florian Fainelli wrote:
> > Introduce a new configuration symbol: MDIO_DEVICE which allows building
> > the MDIO devices and bus code, without pulling in the entire Ethernet
> > PHY library and devices code.
> > 
> > PHYLIB nows select MDIO_DEVICE and the relevant Makefile files are
> > updated to reflect that.
> > 
> > Signed-off-by: Florian Fainelli 
> 
> Andrew, Russell, does that seem sensible to you, shall I re-submit this
> as a proper patch?

I think the only comment I have is that I'd recommend that mdio_bus
and mdio_device end up as one module rather than two for these reasons:

1. they're each relatively small on their own
2. their functionality is tightly related
3. I wonder about the safety of having mdio_device_release() in a
   separate module from the bus_type stuff.

as far as (3) goes, there isn't a good way to ensure that the last
device has been released, and bus_type's (even though they normally
end up as static data) are refcounted objects just like devices, so
I wonder whether allowing driver model bus stuff to be removable is
a good idea.

To that end, I'd add a module_init() for mdio_bus_init() but remove
mdio_bus_exit() entirely, which should mean that the module loader
considers the module non-removable.

Lastly, as this is becoming (a) separate module(s), MODULE_LICENSE()
as an absolute minimum will be required.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.


Re: [PATCH net-next 1/4] net: mv643xx_eth: Do not clobber PHY link outside of state machine

2017-02-06 Thread Andrew Lunn
On Mon, Feb 06, 2017 at 03:55:20PM -0800, Florian Fainelli wrote:
> Calling phy_read_status() means that we may call into
> genphy_read_status() which in turn will use genphy_update_link() which
> can make changes to phydev->link outside of the state machine's state
> transitions. This is an invalid behavior that is now caught as of
> 811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")
> 
> Signed-off-by: Florian Fainelli 

Reviewed-by: Andrew Lunn 

Andrew


[PATCH net-next] net: phy: Allow splitting MDIO bus/device support from PHYs

2017-02-06 Thread Florian Fainelli
Introduce a new configuration symbol: MDIO_DEVICE which allows building
the MDIO devices and bus code, without pulling in the entire Ethernet
PHY library and devices code.

PHYLIB now selects MDIO_DEVICE and the relevant Makefile files are
updated to reflect that.

Signed-off-by: Florian Fainelli 
---
 drivers/net/Makefile   |  2 +-
 drivers/net/phy/Kconfig| 59 ++
 drivers/net/phy/Makefile   |  3 ++-
 drivers/net/phy/mdio_bus.c |  2 ++
 4 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 7336cbd3ef5d..a701e390d48f 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_MII) += mii.o
 obj-$(CONFIG_MDIO) += mdio.o
 obj-$(CONFIG_NET) += Space.o loopback.o
 obj-$(CONFIG_NETCONSOLE) += netconsole.o
-obj-$(CONFIG_PHYLIB) += phy/
+obj-$(CONFIG_MDIO_DEVICE) += phy/
 obj-$(CONFIG_RIONET) += rionet.o
 obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 8dbd59baa34d..01152fb9cb76 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -2,33 +2,12 @@
 # PHY Layer Configuration
 #
 
-menuconfig PHYLIB
-   tristate "PHY Device support and infrastructure"
-   depends on NETDEVICES
+menuconfig MDIO_DEVICE
+   tristate "MDIO bus device drivers"
help
- Ethernet controllers are usually attached to PHY
- devices.  This option provides infrastructure for
- managing PHY devices.
-
-if PHYLIB
-
-config SWPHY
-   bool
-
-config LED_TRIGGER_PHY
-   bool "Support LED triggers for tracking link state"
-   depends on LEDS_TRIGGERS
-   ---help---
- Adds support for a set of LED trigger events per-PHY.  Link
- state change will trigger the events, for consumption by an
- LED class driver.  There are triggers for each link speed currently
- supported by the phy, and are of the form:
-  ::
-
- Where speed is in the form:
-   Mbps or Gbps
+  MDIO devices and driver infrastructure code.
 
-comment "MDIO bus device drivers"
+if MDIO_DEVICE
 
 config MDIO_BCM_IPROC
tristate "Broadcom iProc MDIO bus controller"
@@ -160,6 +139,36 @@ config MDIO_XGENE
  This module provides a driver for the MDIO busses found in the
  APM X-Gene SoC's.
 
+endif
+
+menuconfig PHYLIB
+   tristate "PHY Device support and infrastructure"
+   depends on NETDEVICES
+   select MDIO_DEVICE
+   help
+ Ethernet controllers are usually attached to PHY
+ devices.  This option provides infrastructure for
+ managing PHY devices.
+
+if PHYLIB
+
+config SWPHY
+   bool
+
+config LED_TRIGGER_PHY
+   bool "Support LED triggers for tracking link state"
+   depends on LEDS_TRIGGERS
+   ---help---
+ Adds support for a set of LED trigger events per-PHY.  Link
+ state change will trigger the events, for consumption by an
+ LED class driver.  There are triggers for each link speed currently
+ supported by the phy, and are of the form:
+  ::
+
+ Where speed is in the form:
+   Mbps or Gbps
+
+
 comment "MII PHY device drivers"
 
 config AMD_PHY
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 356859ac7c18..441d228f53ed 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -1,6 +1,7 @@
 # Makefile for Linux PHY drivers and MDIO bus drivers
 
-libphy-y   := phy.o phy_device.o mdio_bus.o mdio_device.o
+libphy-y   := phy.o phy_device.o
+obj-$(CONFIG_MDIO_DEVICE)  += mdio_device.o mdio_bus.o
 libphy-$(CONFIG_SWPHY) += swphy.o
 libphy-$(CONFIG_LED_TRIGGER_PHY)   += phy_led_triggers.o
 
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 653d076eafe5..fa6bd2a2ce3e 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -644,9 +644,11 @@ int __init mdio_bus_init(void)
 
return ret;
 }
+EXPORT_SYMBOL_GPL(mdio_bus_init);
 
 void mdio_bus_exit(void)
 {
	class_unregister(&mdio_bus_class);
	bus_unregister(&mdio_bus_type);
 }
+EXPORT_SYMBOL_GPL(mdio_bus_exit);
-- 
2.9.3



Re: [PATCH net-next 4/4] net: dsa: Do not clobber PHY link outside of state machine

2017-02-06 Thread Andrew Lunn
On Mon, Feb 06, 2017 at 03:55:23PM -0800, Florian Fainelli wrote:
> Calling phy_read_status() means that we may call into
> genphy_read_status() which in turn will use genphy_update_link() which
> can make changes to phydev->link outside of the state machine's state
> transitions. This is an invalid behavior that is now caught as of
> 811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")
> 
> Reported-by: Zefir Kurtisi 
> Signed-off-by: Florian Fainelli 
> ---
>  net/dsa/slave.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/net/dsa/slave.c b/net/dsa/slave.c
> index 09fc3e9462c1..4b6fb6b14de4 100644
> --- a/net/dsa/slave.c
> +++ b/net/dsa/slave.c
> @@ -651,14 +651,10 @@ dsa_slave_get_link_ksettings(struct net_device *dev,
>struct ethtool_link_ksettings *cmd)
>  {
>   struct dsa_slave_priv *p = netdev_priv(dev);
> - int err;
> + int err = -EOPNOTSUPP;
>  
> - err = -EOPNOTSUPP;
> - if (p->phy != NULL) {
> - err = phy_read_status(p->phy);
> - if (err == 0)
> - err = phy_ethtool_ksettings_get(p->phy, cmd);
> - }
> + if (p->phy != NULL)
> + err = phy_ethtool_ksettings_get(p->phy, cmd);

Hi Florian

So what we are effectively doing is returning the state from the last
poll/interrupt. The poll information could be up to 1 second out of
date, but those PHYs using interrupts should give more fresh
information.

Seems reasonable.

Reviewed-by: Andrew Lunn 

Andrew


Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Daniel Borkmann

On 02/07/2017 01:02 AM, Alexei Starovoitov wrote:

On 2/6/17 3:39 PM, Daniel Borkmann wrote:

On 02/04/2017 04:34 AM, Alexei Starovoitov wrote:
[...]

+BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb)
+{
+struct net_device *dev = skb->dev;
+
+if (!dev)
+return 0;
+return proc_get_ns_devid_inum(&dev_net(dev)->ns);
+}
+
+static const struct bpf_func_proto bpf_skb_netns_id_proto = {
+.func= bpf_skb_netns_id,
+.gpl_only= false,
+.ret_type= RET_INTEGER,
+.arg1_type= ARG_PTR_TO_CTX,
+};
+
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
  case BPF_FUNC_trace_printk:
  if (capable(CAP_SYS_ADMIN))
  return bpf_get_trace_printk_proto();
+case BPF_FUNC_sk_netns_id:
+return &bpf_skb_netns_id_proto;
  default:
  return NULL;
  }


Btw, I think here's an oversight that would still need to be
fixed. Above would mean that trace printk from unprivileged would
fall through and use _skb_netns_id_proto as proto now instead
of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the
BPF_FUNC_trace_printk case, not in its fall-through path. Looks
like Chenbo in his get_socket_cookie missed this, too. Other than
that BPF bits seem good to me.


Ahh, right. Good catch.
I'll add 'else return NULL;' otherwise somebody might step on it again.
Thanks Daniel!


I guess an explicit comment "/* fall-through */" would also be fine
and get noticed. Thanks!


Eric,
still waiting for your review of nsfs.c bits.





Re: [RFC net-next] net: phy: Allow splitting MDIO bus/device support from PHYs

2017-02-06 Thread Andrew Lunn
> > Introduce a new configuration symbol: MDIO_DEVICE which allows building
> > the MDIO devices and bus code, without pulling in the entire Ethernet
> > PHY library and devices code.
> > 
> > PHYLIB nows select MDIO_DEVICE and the relevant Makefile files are
> > updated to reflect that.
> > 
> > Signed-off-by: Florian Fainelli 
> 
> Andrew, Russell, does that seem sensible to you, shall I re-submit this
> as a proper patch?

Hi Florian

It does make sense, given that we have some MDIO busses not used for
networking.

Andrew


Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Alexei Starovoitov

On 2/6/17 3:39 PM, Daniel Borkmann wrote:

On 02/04/2017 04:34 AM, Alexei Starovoitov wrote:
[...]

+BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb)
+{
+struct net_device *dev = skb->dev;
+
+if (!dev)
+return 0;
+return proc_get_ns_devid_inum(&dev_net(dev)->ns);
+}
+
+static const struct bpf_func_proto bpf_skb_netns_id_proto = {
+.func= bpf_skb_netns_id,
+.gpl_only= false,
+.ret_type= RET_INTEGER,
+.arg1_type= ARG_PTR_TO_CTX,
+};
+
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
  case BPF_FUNC_trace_printk:
  if (capable(CAP_SYS_ADMIN))
  return bpf_get_trace_printk_proto();
+case BPF_FUNC_sk_netns_id:
+return &bpf_skb_netns_id_proto;
  default:
  return NULL;
  }


Btw, I think here's an oversight that would still need to be
fixed. Above would mean that trace printk from unprivileged would
fall through and use &bpf_skb_netns_id_proto as proto now instead
of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the
BPF_FUNC_trace_printk case, not in its fall-through path. Looks
like Chenbo in his get_socket_cookie missed this, too. Other than
that BPF bits seem good to me.


Ahh, right. Good catch.
I'll add 'else return NULL;' otherwise somebody might step on it again.
Thanks Daniel!

Eric,
still waiting for your review of nsfs.c bits.



Re: [RFC net-next] net: phy: Allow splitting MDIO bus/device support from PHYs

2017-02-06 Thread Florian Fainelli
On 01/30/2017 07:29 PM, Florian Fainelli wrote:
> Introduce a new configuration symbol: MDIO_DEVICE which allows building
> the MDIO devices and bus code, without pulling in the entire Ethernet
> PHY library and devices code.
> 
> PHYLIB now selects MDIO_DEVICE and the relevant Makefile files are
> updated to reflect that.
> 
> Signed-off-by: Florian Fainelli 

Andrew, Russell, does that seem sensible to you, shall I re-submit this
as a proper patch?
-- 
Florian


[PATCH net-next 0/4] net: Incorrect use of phy_read_status()

2017-02-06 Thread Florian Fainelli
Hi all,

This patch series removes incorrect uses of phy_read_status() which can clobber
the PHY device link while we are executing with the state machine running.

greth was potentially another candidate, but it does funky stuff with
auto-negotiation that I am still trying to understand.

Florian Fainelli (4):
  net: mv643xx_eth: Do not clobber PHY link outside of state machine
  net: pxa168_eth: Do not clobber PHY link outside of state machine
  net: netcp: Do not clobber PHY link outside of state machine
  net: dsa: Do not clobber PHY link outside of state machine

 drivers/net/ethernet/marvell/mv643xx_eth.c |  4 +---
 drivers/net/ethernet/marvell/pxa168_eth.c  | 20 +---
 drivers/net/ethernet/ti/netcp_ethss.c  |  2 --
 net/dsa/slave.c| 10 +++---
 4 files changed, 5 insertions(+), 31 deletions(-)

-- 
2.9.3



[PATCH net-next 1/4] net: mv643xx_eth: Do not clobber PHY link outside of state machine

2017-02-06 Thread Florian Fainelli
Calling phy_read_status() means that we may call into
genphy_read_status() which in turn will use genphy_update_link() which
can make changes to phydev->link outside of the state machine's state
transitions. This is an invalid behavior that is now caught as of
811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/marvell/mv643xx_eth.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c 
b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 20cb7f0de601..25642dee49d3 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -1504,9 +1504,7 @@ mv643xx_eth_get_link_ksettings_phy(struct 
mv643xx_eth_private *mp,
int err;
u32 supported, advertising;
 
-   err = phy_read_status(dev->phydev);
-   if (err == 0)
-   err = phy_ethtool_ksettings_get(dev->phydev, cmd);
+   err = phy_ethtool_ksettings_get(dev->phydev, cmd);
 
/*
 * The MAC does not support 1000baseT_Half.
-- 
2.9.3



[PATCH net-next 4/4] net: dsa: Do not clobber PHY link outside of state machine

2017-02-06 Thread Florian Fainelli
Calling phy_read_status() means that we may call into
genphy_read_status() which in turn will use genphy_update_link() which
can make changes to phydev->link outside of the state machine's state
transitions. This is an invalid behavior that is now caught as of
811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")

Reported-by: Zefir Kurtisi 
Signed-off-by: Florian Fainelli 
---
 net/dsa/slave.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 09fc3e9462c1..4b6fb6b14de4 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -651,14 +651,10 @@ dsa_slave_get_link_ksettings(struct net_device *dev,
 struct ethtool_link_ksettings *cmd)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
-   int err;
+   int err = -EOPNOTSUPP;
 
-   err = -EOPNOTSUPP;
-   if (p->phy != NULL) {
-   err = phy_read_status(p->phy);
-   if (err == 0)
-   err = phy_ethtool_ksettings_get(p->phy, cmd);
-   }
+   if (p->phy != NULL)
+   err = phy_ethtool_ksettings_get(p->phy, cmd);
 
return err;
 }
-- 
2.9.3



[PATCH net-next 3/4] net: netcp: Do not clobber PHY link outside of state machine

2017-02-06 Thread Florian Fainelli
Calling phy_read_status() means that we may call into
genphy_read_status() which in turn will use genphy_update_link() which
can make changes to phydev->link outside of the state machine's state
transitions. This is an invalid behavior that is now caught as of
811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/ti/netcp_ethss.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c 
b/drivers/net/ethernet/ti/netcp_ethss.c
index f7bb241b17ab..eece3e2eec14 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2313,7 +2313,6 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
dev_dbg(priv->dev, "phy found: id is: 0x%s\n",
phydev_name(slave->phy));
phy_start(slave->phy);
-   phy_read_status(slave->phy);
}
return 0;
 }
@@ -3119,7 +3118,6 @@ static void init_secondary_ports(struct gbe_priv *gbe_dev,
dev_dbg(dev, "phy found: id is: 0x%s\n",
phydev_name(slave->phy));
phy_start(slave->phy);
-   phy_read_status(slave->phy);
}
}
 }
-- 
2.9.3



[PATCH net-next 2/4] net: pxa168_eth: Do not clobber PHY link outside of state machine

2017-02-06 Thread Florian Fainelli
Calling phy_read_status() means that we may call into
genphy_read_status() which in turn will use genphy_update_link() which
can make changes to phydev->link outside of the state machine's state
transitions. This is an invalid behavior that is now caught as of
811a919135b9 ("phy state machine: failsafe leave invalid RUNNING state")

Since we don't have anything special, switch to the generic
phy_ethtool_get_link_ksettings() function now.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/marvell/pxa168_eth.c | 20 +---
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c 
b/drivers/net/ethernet/marvell/pxa168_eth.c
index 3376a19f1e19..28cb36d9e50a 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -274,8 +274,6 @@ enum hash_table_entry {
HASH_ENTRY_RECEIVE_DISCARD_BIT = 2
 };
 
-static int pxa168_get_link_ksettings(struct net_device *dev,
-struct ethtool_link_ksettings *cmd);
 static int pxa168_init_hw(struct pxa168_eth_private *pep);
 static int pxa168_init_phy(struct net_device *dev);
 static void eth_port_reset(struct net_device *dev);
@@ -987,10 +985,6 @@ static int pxa168_init_phy(struct net_device *dev)
if (err)
return err;
 
-   err = pxa168_get_link_ksettings(dev, );
-   if (err)
-   return err;
-
cmd.base.phy_address = pep->phy_addr;
cmd.base.speed = pep->phy_speed;
cmd.base.duplex = pep->phy_duplex;
@@ -1370,18 +1364,6 @@ static int pxa168_eth_do_ioctl(struct net_device *dev, 
struct ifreq *ifr,
return -EOPNOTSUPP;
 }
 
-static int pxa168_get_link_ksettings(struct net_device *dev,
-struct ethtool_link_ksettings *cmd)
-{
-   int err;
-
-   err = phy_read_status(dev->phydev);
-   if (err == 0)
-   err = phy_ethtool_ksettings_get(dev->phydev, cmd);
-
-   return err;
-}
-
 static void pxa168_get_drvinfo(struct net_device *dev,
   struct ethtool_drvinfo *info)
 {
@@ -1396,7 +1378,7 @@ static const struct ethtool_ops pxa168_ethtool_ops = {
.nway_reset = phy_ethtool_nway_reset,
.get_link   = ethtool_op_get_link,
.get_ts_info= ethtool_op_get_ts_info,
-   .get_link_ksettings = pxa168_get_link_ksettings,
+   .get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
 };
 
-- 
2.9.3



RE: [Intel-wired-lan] [PATCH] net: intel: i40evf: use new api ethtool_{get|set}_link_ksettings

2017-02-06 Thread Wyborny, Carolyn
> -Original Message-
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On
> Behalf Of Philippe Reynes
> Sent: Saturday, February 04, 2017 2:49 PM
> To: Kirsher, Jeffrey T ; da...@davemloft.net
> Cc: netdev@vger.kernel.org; intel-wired-...@lists.osuosl.org; linux-
> ker...@vger.kernel.org; Philippe Reynes 
> Subject: [Intel-wired-lan] [PATCH] net: intel: i40evf: use new api
> ethtool_{get|set}_link_ksettings
> 
> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
Thanks for the patch Phillippe,

We have an internal patch in process for this functionality.  It should be out 
very soon.  I thought it was upstream already but it must not be.  I'll find it 
and get it expedited.

Thanks again,

Carolyn

Carolyn Wyborny 
Linux Development 
Networking Division 
Intel Corporation 





Re: net/kcm: WARNING in kcm_write_msgs

2017-02-06 Thread Cong Wang
On Mon, Feb 6, 2017 at 4:43 AM, Dmitry Vyukov  wrote:
> [resending as plain text]
>
> Hello,
>
> The following program triggers WARNING in kcm_write_msgs:
>
> WARNING: CPU: 3 PID: 2936 at net/kcm/kcmsock.c:627
> kcm_write_msgs+0x12e3/0x1b90 net/kcm/kcmsock.c:627
> CPU: 3 PID: 2936 Comm: a.out Not tainted 4.10.0-rc6+ #209
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:15 [inline]
>  dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
>  panic+0x1fb/0x412 kernel/panic.c:179
>  __warn+0x1c4/0x1e0 kernel/panic.c:539
>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:582
>  kcm_write_msgs+0x12e3/0x1b90 net/kcm/kcmsock.c:627
>  kcm_sendmsg+0x163a/0x2200 net/kcm/kcmsock.c:1029
>  sock_sendmsg_nosec net/socket.c:635 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:645
>  sock_write_iter+0x326/0x600 net/socket.c:848
>  new_sync_write fs/read_write.c:499 [inline]
>  __vfs_write+0x483/0x740 fs/read_write.c:512
>  vfs_write+0x187/0x530 fs/read_write.c:560
>  SYSC_write fs/read_write.c:607 [inline]
>  SyS_write+0xfb/0x230 fs/read_write.c:599
>  entry_SYSCALL_64_fastpath+0x1f/0xc2
[...]
>   syscall(__NR_write, sock2, 0x208aaf27ul, 0x0ul);

Looks like len == 0 case is not handled correctly in kcm_sendmsg().
The attached patch fixes it, but I am not sure if it is correct in all
cases yet, the logic is complicated.
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7e08a4d..64f0e85 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -929,23 +929,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr 
*msg, size_t len)
goto out_error;
}
 
-   /* New message, alloc head skb */
-   head = alloc_skb(0, sk->sk_allocation);
-   while (!head) {
-   kcm_push(kcm);
-   err = sk_stream_wait_memory(sk, &timeo);
-   if (err)
-   goto out_error;
-
+   if (msg_data_left(msg)) {
+   /* New message, alloc head skb */
head = alloc_skb(0, sk->sk_allocation);
-   }
+   while (!head) {
+   kcm_push(kcm);
+   err = sk_stream_wait_memory(sk, &timeo);
+   if (err)
+   goto out_error;
 
-   skb = head;
+   head = alloc_skb(0, sk->sk_allocation);
+   }
 
-   /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
-* csum_and_copy_from_iter from skb_do_copy_data_nocache.
-*/
-   skb->ip_summed = CHECKSUM_UNNECESSARY;
+   skb = head;
+
+   /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+* csum_and_copy_from_iter from skb_do_copy_data_nocache.
+*/
+   skb->ip_summed = CHECKSUM_UNNECESSARY;
+   }
 
 start:
while (msg_data_left(msg)) {
@@ -1018,10 +1020,12 @@ static int kcm_sendmsg(struct socket *sock, struct 
msghdr *msg, size_t len)
if (eor) {
bool not_busy = skb_queue_empty(&sk->sk_write_queue);
 
-   /* Message complete, queue it on send buffer */
-   __skb_queue_tail(&sk->sk_write_queue, head);
-   kcm->seq_skb = NULL;
-   KCM_STATS_INCR(kcm->stats.tx_msgs);
+   if (head) {
+   /* Message complete, queue it on send buffer */
+   __skb_queue_tail(&sk->sk_write_queue, head);
+   kcm->seq_skb = NULL;
+   KCM_STATS_INCR(kcm->stats.tx_msgs);
+   }
 
if (msg->msg_flags & MSG_BATCH) {
kcm->tx_wait_more = true;


[pull request][net-next V2 0/6] Mellanox mlx5 updates 2017-01-31

2017-02-06 Thread Saeed Mahameed
Hi Dave,

This pull request includes two new mlx5 features and two small fixes for 
net-next,
Details are below.

Please pull and let me know if there's any problem.

Sorry for the delay on addressing the comments.
v1->v2:
- Addressed the comments on the static checker fix patch
- Squash cacheline patches (it is more correct they come as one).
- Dropped "net/mlx5e: Calc vlan_tag_present only once on xmit" 
  as it doesn't seem to give any added value.

Thanks,
Saeed.

---

The following changes since commit bd092ad1463ca0990581fa992e12a9b0ed295d25:

  Merge branch 'remove-__napi_complete_done' (2017-02-05 16:11:59 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-updates-2017-01-31

for you to fetch changes up to 8ca967ab67671f07ac7daef4f854559bc66799a3:

  net/mlx5e: Bring back bfreg uar map dedicated pointer (2017-02-06 18:20:18 
+0200)


mlx5-updates-2017-01-31

This series includes some updates to mlx5 core and ethernet driver.

We got one patch from Or to fix some static checker warnings.

2nd patch from Dan came to add the support for 128B cache line
in the HCA, which will configure the hardware to use 128B alignment only
on systems with 128B cache lines, otherwise it will be kept as the current
default of 64B.

>From me three patches to support no inline copy on TX on ConnectX-5 and
later HCAs.  Starting with two small infrastructure changes and
refactoring patches followed by two patches to add the actual support for
both xmit ndo and XDP xmit routines.
Last patch is a simple fix to return a mistakenly removed pointer from the
SQ structure, which was remove in previous submission of mlx5 4K UAR.

Saeed.


Daniel Jurgens (1):
  net/mlx5: Configure cache line size for start and end padding

Or Gerlitz (1):
  net/mlx5: Fix static checker warnings

Saeed Mahameed (4):
  net/mlx5: TX WQE update
  net/mlx5e: Tx, no inline copy on ConnectX-5
  net/mlx5e: XDP Tx, no inline copy on ConnectX-5
  net/mlx5e: Bring back bfreg uar map dedicated pointer

 drivers/infiniband/hw/mlx5/qp.c   |  6 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 17 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 21 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 20 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 30 +--
 drivers/net/ethernet/mellanox/mlx5/core/main.c|  6 +
 include/linux/mlx5/device.h   |  3 ++-
 include/linux/mlx5/mlx5_ifc.h |  9 ---
 include/linux/mlx5/qp.h   | 16 ++--
 9 files changed, 85 insertions(+), 43 deletions(-)


[net-next V2 3/6] net/mlx5: TX WQE update

2017-02-06 Thread Saeed Mahameed
Add new TX WQE fields for Connect-X5 vlan insertion support,
type and vlan_tci, when type = MLX5_ETH_WQE_INSERT_VLAN the
HW will insert the vlan and prio fields (vlan_tci) to the packet.

Those bits and the inline header fields are mutually exclusive, and
valid only when:
MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_NOT_REQUIRED
and MLX5_CAP_ETH(mdev, wqe_vlan_insert),
who will be set in ConnectX-5 and later HW generations.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/infiniband/hw/mlx5/qp.c |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  8 
 include/linux/mlx5/mlx5_ifc.h   |  3 ++-
 include/linux/mlx5/qp.h | 16 ++--
 5 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6a83fb32599d..e31bf11ae64f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2984,20 +2984,20 @@ static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
 
if (wr->opcode == IB_WR_LSO) {
struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
-   int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
+   int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start);
u64 left, leftlen, copysz;
void *pdata = ud_wr->header;
 
left = ud_wr->hlen;
eseg->mss = cpu_to_be16(ud_wr->mss);
-   eseg->inline_hdr_sz = cpu_to_be16(left);
+   eseg->inline_hdr.sz = cpu_to_be16(left);
 
/*
 * check if there is space till the end of queue, if yes,
 * copy all in one shot, otherwise copy till the end of queue,
 * rollback and than the copy the left
 */
-   leftlen = qend - (void *)eseg->inline_hdr_start;
+   leftlen = qend - (void *)eseg->inline_hdr.start;
copysz = min_t(u64, leftlen, left);
 
memcpy(seg - size_of_inl_hdr_start, pdata, copysz);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index fd8dff6acc12..965e69e9ff1e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -687,8 +687,8 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
memset(wqe, 0, sizeof(*wqe));
 
/* copy the inline part */
-   memcpy(eseg->inline_hdr_start, xdp->data, MLX5E_XDP_MIN_INLINE);
-   eseg->inline_hdr_sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+   memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
+   eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
 
dseg = (struct mlx5_wqe_data_seg *)cseg + (MLX5E_XDP_TX_DS_COUNT - 1);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index cfb68371c397..678c07c8fbb0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -284,18 +284,18 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
wi->num_bytes = num_bytes;
 
if (skb_vlan_tag_present(skb)) {
-   mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, _data,
+   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, _data,
  _len);
ihs += VLAN_HLEN;
} else {
-   memcpy(eseg->inline_hdr_start, skb_data, ihs);
+   memcpy(eseg->inline_hdr.start, skb_data, ihs);
mlx5e_tx_skb_pull_inline(_data, _len, ihs);
}
 
-   eseg->inline_hdr_sz = cpu_to_be16(ihs);
+   eseg->inline_hdr.sz = cpu_to_be16(ihs);
 
ds_cnt  = sizeof(*wqe) / MLX5_SEND_WQE_DS;
-   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr_start),
+   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start),
   MLX5_SEND_WQE_DS);
dseg= (struct mlx5_wqe_data_seg *)cseg + ds_cnt;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cc8ae860cd45..afcd4736d8df 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -577,7 +577,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
u8 lro_cap[0x1];
u8 lro_psh_flag[0x1];
u8 lro_time_stamp[0x1];
-   u8 reserved_at_5[0x3];
+   u8 reserved_at_5[0x2];
+   u8 wqe_vlan_insert[0x1];
u8 self_lb_en_modifiable[0x1];
u8 reserved_at_9[0x2];
u8 max_lso_cap[0x5];
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 

[net-next V2 4/6] net/mlx5e: Tx, no inline copy on ConnectX-5

2017-02-06 Thread Saeed Mahameed
ConnectX-5 and later HW generations will report min inline mode ==
MLX5_INLINE_MODE_NONE, which means driver is not required to copy packet
headers to inline fields of TX WQE.

When inline is not required, vlan insertion will be handled in the
TX descriptor rather than copy to inline.

For LSO case driver is still required to copy headers, for the HW to
duplicate on wire.

This will improve CPU utilization and boost TX performance.

Tested with pktgen burst single flow:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
HCA: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]

Before: 15.1Mpps
After:  17.2Mpps
Improvement: 14%

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 30 +--
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1b7fe43ab22b..9cd38401fdc9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1029,9 +1029,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 
sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
sq->max_inline  = param->max_inline;
-   sq->min_inline_mode =
-   MLX5_CAP_ETH(mdev, wqe_inline_mode) == 
MLX5_CAP_INLINE_MODE_VPORT_CONTEXT ?
-   param->min_inline_mode : 0;
+   sq->min_inline_mode = param->min_inline_mode;
 
err = mlx5e_alloc_sq_db(sq, cpu_to_node(c->cpu));
if (err)
@@ -1095,7 +1093,10 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct 
mlx5e_sq_param *param)
MLX5_SET(sqc,  sqc, tis_num_0, param->type == MLX5E_SQ_ICO ?
   0 : priv->tisn[sq->tc]);
MLX5_SET(sqc,  sqc, cqn,sq->cq.mcq.cqn);
-   MLX5_SET(sqc,  sqc, min_wqe_inline_mode, sq->min_inline_mode);
+
+   if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == 
MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+   MLX5_SET(sqc,  sqc, min_wqe_inline_mode, sq->min_inline_mode);
+
MLX5_SET(sqc,  sqc, state,  MLX5_SQC_STATE_RST);
MLX5_SET(sqc,  sqc, tis_lst_sz, param->type == MLX5E_SQ_ICO ? 0 : 1);
 
@@ -3533,6 +3534,10 @@ static void mlx5e_build_nic_netdev_priv(struct 
mlx5_core_dev *mdev,
MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
mlx5_query_min_inline(mdev, >params.tx_min_inline_mode);
+   if (priv->params.tx_min_inline_mode == MLX5_INLINE_MODE_NONE &&
+   !MLX5_CAP_ETH(mdev, wqe_vlan_insert))
+   priv->params.tx_min_inline_mode = MLX5_INLINE_MODE_L2;
+
priv->params.num_tc= 1;
priv->params.rss_hfunc = ETH_RSS_HASH_XOR;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 678c07c8fbb0..f193128bac4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -154,6 +154,8 @@ static inline unsigned int mlx5e_calc_min_inline(enum 
mlx5_inline_modes mode,
int hlen;
 
switch (mode) {
+   case MLX5_INLINE_MODE_NONE:
+   return 0;
case MLX5_INLINE_MODE_TCP_UDP:
hlen = eth_get_headlen(skb->data, skb_headlen(skb));
if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
@@ -283,21 +285,23 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
 
wi->num_bytes = num_bytes;
 
-   if (skb_vlan_tag_present(skb)) {
-   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, _data,
- _len);
-   ihs += VLAN_HLEN;
-   } else {
-   memcpy(eseg->inline_hdr.start, skb_data, ihs);
-   mlx5e_tx_skb_pull_inline(_data, _len, ihs);
+   ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+   if (ihs) {
+   if (skb_vlan_tag_present(skb)) {
+   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, 
_data, _len);
+   ihs += VLAN_HLEN;
+   } else {
+   memcpy(eseg->inline_hdr.start, skb_data, ihs);
+   mlx5e_tx_skb_pull_inline(_data, _len, ihs);
+   }
+   eseg->inline_hdr.sz = cpu_to_be16(ihs);
+   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), 
MLX5_SEND_WQE_DS);
+   } else if (skb_vlan_tag_present(skb)) {
+   eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
+   eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb));
}
 
-   eseg->inline_hdr.sz = cpu_to_be16(ihs);
-
-   ds_cnt  = sizeof(*wqe) / MLX5_SEND_WQE_DS;

[net-next V2 1/6] net/mlx5: Fix static checker warnings

2017-02-06 Thread Saeed Mahameed
From: Or Gerlitz 

For some reason, sparse doesn't like using an expression of type (!x)
with a bitwise | and &.  In order to mitigate that, we use a local variable.

This removes the following sparse complaints on the core driver
(and similar ones on the IB driver too):

drivers/net/ethernet/mellanox/mlx5/core/srq.c:83:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/srq.c:96:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/port.c:59:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/vport.c:561:9: warning: dubious: !x & y

Signed-off-by: Or Gerlitz 
Signed-off-by: Matan Barak 
Reported-by: Bart Van Assche 
Signed-off-by: Saeed Mahameed 
---
 include/linux/mlx5/device.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 7b6cd67a263f..dd9a263ed368 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -67,10 +67,11 @@
 
 /* insert a value to a struct */
 #define MLX5_SET(typ, p, fld, v) do { \
+   u32 _v = v; \
BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \
*((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \
cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \
-(~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, 
fld)) \
+(~__mlx5_dw_mask(typ, fld))) | (((_v) & __mlx5_mask(typ, 
fld)) \
 << __mlx5_dw_bit_off(typ, fld))); \
 } while (0)
 
-- 
2.11.0



[net-next V2 5/6] net/mlx5e: XDP Tx, no inline copy on ConnectX-5

2017-02-06 Thread Saeed Mahameed
ConnectX-5 and later HW generations will report min inline mode ==
MLX5_INLINE_MODE_NONE, which means driver is not required to copy packet
headers to inline fields of TX WQE.

Avoid copy to inline segment in XDP TX routine when HW inline mode doesn't
require it.

This will improve CPU utilization and boost XDP TX performance.

Tested with xdp2 single flow:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
HCA: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]

Before: 7.4Mpps
After:  7.8Mpps
Improvement: 5%

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 20 +---
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9b23d3329847..8be4b12b5545 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -120,8 +120,7 @@
 #define MLX5E_XDP_IHS_DS_COUNT \
DIV_ROUND_UP(MLX5E_XDP_MIN_INLINE - 2, MLX5_SEND_WQE_DS)
 #define MLX5E_XDP_TX_DS_COUNT \
-   (MLX5E_XDP_IHS_DS_COUNT + \
-(sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */)
+   ((sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */)
 #define MLX5E_XDP_TX_WQEBBS \
DIV_ROUND_UP(MLX5E_XDP_TX_DS_COUNT, MLX5_SEND_WQEBB_NUM_DS)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9cd38401fdc9..ed230757d9c5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1806,8 +1806,7 @@ static void mlx5e_build_xdpsq_param(struct mlx5e_priv 
*priv,
MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size);
 
param->max_inline = priv->params.tx_max_inline;
-   /* FOR XDP SQs will support only L2 inline mode */
-   param->min_inline_mode = MLX5_INLINE_MODE_NONE;
+   param->min_inline_mode = priv->params.tx_min_inline_mode;
param->type = MLX5E_SQ_XDP;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 965e69e9ff1e..b039b87742a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -657,9 +657,10 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq 
*rq,
struct mlx5_wqe_ctrl_seg *cseg = >ctrl;
struct mlx5_wqe_eth_seg  *eseg = >eth;
struct mlx5_wqe_data_seg *dseg;
+   u8 ds_cnt = MLX5E_XDP_TX_DS_COUNT;
 
ptrdiff_t data_offset = xdp->data - xdp->data_hard_start;
-   dma_addr_t dma_addr  = di->addr + data_offset + MLX5E_XDP_MIN_INLINE;
+   dma_addr_t dma_addr  = di->addr + data_offset;
unsigned int dma_len = xdp->data_end - xdp->data;
 
if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE ||
@@ -680,17 +681,22 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq 
*rq,
return false;
}
 
-   dma_len -= MLX5E_XDP_MIN_INLINE;
dma_sync_single_for_device(sq->pdev, dma_addr, dma_len,
   PCI_DMA_TODEVICE);
 
memset(wqe, 0, sizeof(*wqe));
 
-   /* copy the inline part */
-   memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
-   eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+   dseg = (struct mlx5_wqe_data_seg *)eseg + 1;
+   /* copy the inline part if required */
+   if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
+   memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
+   eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+   dma_len  -= MLX5E_XDP_MIN_INLINE;
+   dma_addr += MLX5E_XDP_MIN_INLINE;
 
-   dseg = (struct mlx5_wqe_data_seg *)cseg + (MLX5E_XDP_TX_DS_COUNT - 1);
+   ds_cnt   += MLX5E_XDP_IHS_DS_COUNT;
+   dseg++;
+   }
 
/* write the dma part */
dseg->addr   = cpu_to_be64(dma_addr);
@@ -698,7 +704,7 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
dseg->lkey   = sq->mkey_be;
 
cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
-   cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | MLX5E_XDP_TX_DS_COUNT);
+   cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 
sq->db.xdp.di[pi] = *di;
wi->opcode = MLX5_OPCODE_SEND;
-- 
2.11.0



[net-next V2 6/6] net/mlx5e: Bring back bfreg uar map dedicated pointer

2017-02-06 Thread Saeed Mahameed
4K Uar series modified the mlx5e driver to use the new bfreg API,
and mistakenly removed the sq->uar_map iomem data path dedicated
pointer, which was meant to be read from xmit path for cache locality
utilization.

Fix that by returning that pointer to the SQ struct.

Fixes: 7309cb4ad71e ("IB/mlx5: Support 4k UAR for libmlx5")
Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8be4b12b5545..95ca03c0d9f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -475,6 +475,7 @@ struct mlx5e_sq {
/* read only */
struct mlx5_wq_cyc wq;
u32dma_fifo_mask;
+   void __iomem  *uar_map;
struct netdev_queue   *txq;
u32sqn;
u16bf_buf_size;
@@ -831,9 +832,9 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 */
wmb();
if (bf_sz)
-   __iowrite64_copy(sq->bfreg.map + ofst, ctrl, bf_sz);
+   __iowrite64_copy(sq->uar_map + ofst, ctrl, bf_sz);
else
-   mlx5_write64((__be32 *)ctrl, sq->bfreg.map + ofst, NULL);
+   mlx5_write64((__be32 *)ctrl, sq->uar_map + ofst, NULL);
/* flush the write-combining mapped buffer */
wmb();
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ed230757d9c5..3cce6281e075 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1016,6 +1016,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
if (err)
return err;
 
+   sq->uar_map = sq->bfreg.map;
param->wq.db_numa_node = cpu_to_node(c->cpu);
 
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
-- 
2.11.0



[net-next V2 2/6] net/mlx5: Configure cache line size for start and end padding

2017-02-06 Thread Saeed Mahameed
From: Daniel Jurgens 

There is a hardware feature that will pad the start or end of a DMA to
be cache line aligned to avoid RMWs on the last cache line. The default
cache line size setting for this feature is 64B. This change configures
the hardware to use 128B alignment on systems with 128B cache lines.

In addition we lower bound MPWRQ stride by HCA cacheline in mlx5e,
MPWRQ stride should be at least the HCA cacheline, the current default
is 64B and in case HCA_CAP.cache_line_128byte capability is set, MPWRQ RX
stride will automatically be aligned to 128B.

Signed-off-by: Daniel Jurgens 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 9 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c| 6 ++
 include/linux/mlx5/mlx5_ifc.h | 6 --
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9b52c58cd528..9b23d3329847 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -70,8 +70,13 @@
 
 #define MLX5_RX_HEADROOM NET_SKB_PAD
 
-#define MLX5_MPWRQ_LOG_STRIDE_SIZE 6  /* >= 6, HW restriction */
-#define MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS8  /* >= 6, HW 
restriction */
+#define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \
+   (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \
+   max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
+#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)   
MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
+#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) 
MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
+
 #define MLX5_MPWRQ_LOG_WQE_SZ  18
 #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ab6f4d3b8063..1b7fe43ab22b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -89,8 +89,8 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, 
u8 rq_type)
MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
priv->params.mpwqe_log_stride_sz =
MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
-   MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS :
-   MLX5_MPWRQ_LOG_STRIDE_SIZE;
+   MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(priv->mdev) :
+   MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(priv->mdev);
priv->params.mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ -
priv->params.mpwqe_log_stride_sz;
break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f7e50ba67f94..c4242a4e8130 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -543,6 +543,12 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 
MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
 
+   if (MLX5_CAP_GEN_MAX(dev, cache_line_128byte))
+   MLX5_SET(cmd_hca_cap,
+set_hca_cap,
+cache_line_128byte,
+cache_line_size() == 128 ? 1 : 0);
+
err = set_caps(dev, set_ctx, set_sz,
   MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a919dfb920ae..cc8ae860cd45 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -804,10 +804,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_150[0xa];
u8 log_max_ra_res_qp[0x6];
 
-   u8 pad_cap[0x1];
+   u8 end_pad[0x1];
u8 cc_query_allowed[0x1];
u8 cc_modify_allowed[0x1];
-   u8 reserved_at_163[0xd];
+   u8 start_pad[0x1];
+   u8 cache_line_128byte[0x1];
+   u8 reserved_at_163[0xb];
u8 gid_table_size[0x10];
 
u8 out_of_seq_cnt[0x1];
-- 
2.11.0



Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Daniel Borkmann

On 02/04/2017 04:34 AM, Alexei Starovoitov wrote:
[...]

+BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb)
+{
+   struct net_device *dev = skb->dev;
+
+   if (!dev)
+   return 0;
+   return proc_get_ns_devid_inum(&dev_net(dev)->ns);
+}
+
+static const struct bpf_func_proto bpf_skb_netns_id_proto = {
+   .func   = bpf_skb_netns_id,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+};
+
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
+   case BPF_FUNC_sk_netns_id:
+   return &bpf_skb_netns_id_proto;
default:
return NULL;
}


Btw, I think here's an oversight that would still need to be
fixed. Above would mean that trace printk from unprivileged would
fall through and use &bpf_skb_netns_id_proto as proto now instead
of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the
BPF_FUNC_trace_printk case, not in its fall-through path. Looks
like Chenbo in his get_socket_cookie missed this, too. Other than
that BPF bits seem good to me.


@@ -2700,6 +2731,17 @@ xdp_func_proto(enum bpf_func_id func_id)
  }


Re: [PATCH iproute2] ip route: Make name of protocol 0 consistent

2017-02-06 Thread David Ahern
On 2/6/17 3:01 PM, Stephen Hemminger wrote:
> On Thu,  2 Feb 2017 09:22:06 -0800
> David Ahern  wrote:
> 
>> iproute2 can inconsistently show the name of protocol 0 if a route with
>> a custom protocol is added. For example:
>>   dsa@cartman:~$ ip -6 ro ls table all | egrep 'proto none|proto unspec'
>>   local ::1 dev lo  table local  proto none  metric 0  pref medium
>>   local fe80::225:90ff:fecb:1c18 dev lo  table local  proto none  metric 0  
>> pref medium
>>   local fe80::92e2:baff:fe5c:da5d dev lo  table local  proto none  metric 0  
>> pref medium
>>
>> protocol 0 is pretty printed as "none". Add a route with a custom protocol:
>>   dsa@cartman:~$ sudo ip -6 ro add  2001:db8:200::1/128 dev eth0 proto 123
>>
>> And now display has switched from "none" to "unspec":
>>   dsa@cartman:~$ ip -6 ro ls table all | egrep 'proto none|proto unspec'
>>   local ::1 dev lo  table local  proto unspec  metric 0  pref medium
>>   local fe80::225:90ff:fecb:1c18 dev lo  table local  proto unspec  metric 0 
>>  pref medium
>>   local fe80::92e2:baff:fe5c:da5d dev lo  table local  proto unspec  metric 
>> 0  pref medium
>>
>> The rt_protos file has the id to name mapping as "unspec" while
>> rtnl_rtprot_tab[0] has "none". The presence of a custom protocol id
>> triggers reading the rt_protos file and overwriting the string in
>> rtnl_rtprot_tab. All of this is logic from 2004 and earlier.
>>
>> The simplest change to achieve consistency is to update the rt_protos
>> file to use "none" instead of "unspec".
>>
>> Signed-off-by: David Ahern 
>> ---
>>  etc/iproute2/rt_protos | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/etc/iproute2/rt_protos b/etc/iproute2/rt_protos
>> index 82cf9c46cf6f..21af85b9d7e1 100644
>> --- a/etc/iproute2/rt_protos
>> +++ b/etc/iproute2/rt_protos
>> @@ -1,7 +1,7 @@
>>  #
>>  # Reserved protocols.
>>  #
>> -0   unspec
>> +0   none
>>  1   redirect
>>  2   kernel
>>  3   boot
> 
> This doesn't look like a good solution, you lose the value of unspec.
> 
> Just to clarify. You added a custom protocol value to netlink.
> And then you are using upstream iproute2 source to display the value.

no. I am saying the string displayed for protocol '0' is changing. This
is all within iproute2 code and files; it has 2 strings for protocol 0:

lib/rt_names.c:
static char *rtnl_rtprot_tab[256] = {
[RTPROT_UNSPEC]   = "none",

and the rt_protos file above shows "unspec"

The presence of a custom protocol triggers the rt_protos file to be read:

const char *rtnl_rtprot_n2a(int id, char *buf, int len)
{
if (id < 0 || id >= 256) {
snprintf(buf, len, "%u", id);
return buf;
}
if (!rtnl_rtprot_tab[id]) {
if (!rtnl_rtprot_init)
rtnl_rtprot_initialize();


Reading the file changes the string in rtnl_rtprot_tab for
RTPROT_UNSPEC. Both string values -- "none" and "unspec" come from
iproute2, so my point is that string is inconsistent within iproute2.


Re: net/icmp: null-ptr-deref in ping_v4_push_pending_frames

2017-02-06 Thread Florian Westphal
Cong Wang  wrote:
> On Mon, Feb 6, 2017 at 11:39 AM, Andrey Konovalov  
> wrote:
> > Hi,
> >
> > I've got the following error report while running the syzkaller fuzzer.
> >
> > The null-ptr-deref is caused by sendto() on a socket(PF_INET,
> > SOCK_DGRAM, PROT_ICMP).
> > Note, that this requires the ability to create such sockets, which can
> > be configured by net.ipv4.ping_group_range
> > (https://lwn.net/Articles/422330/).
> >
> > A reproducer and .config are attached.
> >
> > On commit a572a1b999489efb591287632279c6c9eca3e4ed.
> >
> > general protection fault:  [#1] SMP KASAN
> > Dumping ftrace buffer:
> >(ftrace buffer empty)
> > Modules linked in:
> > CPU: 2 PID: 3880 Comm: syz-executor1 Not tainted 4.10.0-rc6+ #124
[..]
> 
> This fixes it for me:
> 
> diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
> index 86cca61..68d77b1 100644
> --- a/net/ipv4/ping.c
> +++ b/net/ipv4/ping.c
> @@ -642,6 +642,8 @@ static int ping_v4_push_pending_frames(struct sock
> *sk, struct pingfakehdr *pfh,
>  {
> struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
> 
> +   if (!skb)
> +   return 0;
> pfh->wcheck = csum_partial((char *)&pfh->icmph,
> sizeof(struct icmphdr), pfh->wcheck);
> pfh->icmph.checksum = csum_fold(pfh->wcheck);

Sigh.  I wonder if we can remove ping sockets.

IIRC they were born out of a 'no suid' requirement in combination
with 'suid is insecure' but, alas, placing it in kernel evidently
doesn't make things any more secure either.

Those that don't want a suid ping binary could probably convince systemd
developers to provide systemd-icmpd instead with ping dbus interface
(ok, I'll shut up now ;)


[PATCH net-next v2 1/8] bpf: Use bpf_load_program() from the library

2017-02-06 Thread Mickaël Salaün
Replace bpf_prog_load() with bpf_load_program() calls.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/lib/bpf/bpf.c |  9 -
 tools/lib/bpf/bpf.h |  4 ++--
 tools/testing/selftests/bpf/Makefile|  4 +++-
 tools/testing/selftests/bpf/bpf_sys.h   | 21 -
 tools/testing/selftests/bpf/test_tag.c  |  6 --
 tools/testing/selftests/bpf/test_verifier.c |  8 +---
 6 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 3ddb58a36d3c..e96e2a9a7742 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -42,7 +42,7 @@
 # endif
 #endif
 
-static __u64 ptr_to_u64(void *ptr)
+static __u64 ptr_to_u64(const void *ptr)
 {
return (__u64) (unsigned long) ptr;
 }
@@ -69,14 +69,13 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size,
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
-size_t insns_cnt, char *license,
+int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
+size_t insns_cnt, const char *license,
 __u32 kern_version, char *log_buf, size_t log_buf_sz)
 {
int fd;
-   union bpf_attr attr;
+   union bpf_attr attr = {};
 
-   bzero(&attr, sizeof(attr));
attr.prog_type = type;
attr.insn_cnt = (__u32)insns_cnt;
attr.insns = ptr_to_u64(insns);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index a2f9853dd882..bc959a2de023 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -28,8 +28,8 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, 
int value_size,
 
 /* Recommend log buffer size */
 #define BPF_LOG_BUF_SIZE 65536
-int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
-size_t insns_cnt, char *license,
+int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
+size_t insns_cnt, const char *license,
 __u32 kern_version, char *log_buf,
 size_t log_buf_sz);
 
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index f3d65ad53494..a35f564f66a1 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,4 +1,4 @@
-CFLAGS += -Wall -O2 -lcap -I../../../include/uapi
+CFLAGS += -Wall -O2 -lcap -I../../../include/uapi -I../../../lib
 
 test_objs = test_verifier test_tag test_maps test_lru_map test_lpm_map
 
@@ -7,6 +7,8 @@ TEST_FILES := $(test_objs)
 
 all: $(test_objs)
 
+$(test_objs): ../../../lib/bpf/bpf.o
+
 include ../lib.mk
 
 clean:
diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
index 6b4565f2a3f2..e7bbe3e5402e 100644
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ b/tools/testing/selftests/bpf/bpf_sys.h
@@ -84,25 +84,4 @@ static inline int bpf_map_create(enum bpf_map_type type, 
uint32_t size_key,
return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-static inline int bpf_prog_load(enum bpf_prog_type type,
-   const struct bpf_insn *insns, size_t size_insns,
-   const char *license, char *log, size_t size_log)
-{
-   union bpf_attr attr = {};
-
-   attr.prog_type = type;
-   attr.insns = bpf_ptr_to_u64(insns);
-   attr.insn_cnt = size_insns / sizeof(struct bpf_insn);
-   attr.license = bpf_ptr_to_u64(license);
-
-   if (size_log > 0) {
-   attr.log_buf = bpf_ptr_to_u64(log);
-   attr.log_size = size_log;
-   attr.log_level = 1;
-   log[0] = 0;
-   }
-
-   return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
-}
-
 #endif /* __BPF_SYS__ */
diff --git a/tools/testing/selftests/bpf/test_tag.c 
b/tools/testing/selftests/bpf/test_tag.c
index 5f7c602f47d1..b77dc4b03e77 100644
--- a/tools/testing/selftests/bpf/test_tag.c
+++ b/tools/testing/selftests/bpf/test_tag.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 
+#include 
+
 #include "../../../include/linux/filter.h"
 
 #include "bpf_sys.h"
@@ -55,8 +57,8 @@ static int bpf_try_load_prog(int insns, int fd_map,
int fd_prog;
 
bpf_filler(insns, fd_map);
-   fd_prog = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, prog, insns *
-   sizeof(struct bpf_insn), "", NULL, 0);
+   fd_prog = bpf_load_program(BPF_PROG_TYPE_SCHED_CLS, prog, insns, "", 0,
+   NULL, 0);
assert(fd_prog > 0);
if (fd_map > 0)
bpf_filler(insns, 0);
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 6a82e7db2c20..62ae4e7a2278 100644
--- a/tools/testing/selftests/bpf/test_verifier.c

[PATCH] net: ethernet: ti: cpsw: remove netif_trans_update

2017-02-06 Thread Ivan Khoronzhuk
No need to update jiffies in txq->trans_start twice, it's supposed to be
done in netdev_start_xmit() and anyway is re-written. Also, no reason to
update trans time in case of an error.

Signed-off-by: Ivan Khoronzhuk 
---
Based on net-next/master

 drivers/net/ethernet/ti/cpsw.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 35a95dc..4d1c0c3 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1601,8 +1601,6 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff 
*skb,
struct cpdma_chan *txch;
int ret, q_idx;
 
-   netif_trans_update(ndev);
-
if (skb_padto(skb, CPSW_MIN_PACKET_SIZE)) {
cpsw_err(priv, tx_err, "packet pad failed\n");
ndev->stats.tx_dropped++;
-- 
2.7.4



[PATCH net-next v2 3/8] bpf: Use bpf_map_lookup_elem() from the library

2017-02-06 Thread Mickaël Salaün
Replace bpf_map_lookup() with bpf_map_lookup_elem() calls.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/lib/bpf/bpf.c|  5 ++---
 tools/lib/bpf/bpf.h|  2 +-
 tools/testing/selftests/bpf/bpf_sys.h  | 11 ---
 tools/testing/selftests/bpf/test_lpm_map.c | 16 
 tools/testing/selftests/bpf/test_lru_map.c | 28 ++--
 tools/testing/selftests/bpf/test_maps.c| 30 +++---
 6 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index ea3369b50321..81505801fa33 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -110,11 +110,10 @@ int bpf_map_update_elem(int fd, const void *key, const 
void *value,
return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
 }
 
-int bpf_map_lookup_elem(int fd, void *key, void *value)
+int bpf_map_lookup_elem(int fd, const void *key, void *value)
 {
-   union bpf_attr attr;
+   union bpf_attr attr = {};
 
-   bzero(&attr, sizeof(attr));
attr.map_fd = fd;
attr.key = ptr_to_u64(key);
attr.value = ptr_to_u64(value);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 2458534c8b33..171cf594f782 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -36,7 +36,7 @@ int bpf_load_program(enum bpf_prog_type type, const struct 
bpf_insn *insns,
 int bpf_map_update_elem(int fd, const void *key, const void *value,
__u64 flags);
 
-int bpf_map_lookup_elem(int fd, void *key, void *value);
+int bpf_map_lookup_elem(int fd, const void *key, void *value);
 int bpf_map_delete_elem(int fd, void *key);
 int bpf_map_get_next_key(int fd, void *key, void *next_key);
 int bpf_obj_pin(int fd, const char *pathname);
diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
index e08dec0db9e0..0a5a6060db70 100644
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ b/tools/testing/selftests/bpf/bpf_sys.h
@@ -24,17 +24,6 @@ static inline int bpf(int cmd, union bpf_attr *attr, 
unsigned int size)
 #endif
 }
 
-static inline int bpf_map_lookup(int fd, const void *key, void *value)
-{
-   union bpf_attr attr = {};
-
-   attr.map_fd = fd;
-   attr.key = bpf_ptr_to_u64(key);
-   attr.value = bpf_ptr_to_u64(value);
-
-   return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
-}
-
 static inline int bpf_map_delete(int fd, const void *key)
 {
union bpf_attr attr = {};
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c 
b/tools/testing/selftests/bpf/test_lpm_map.c
index e29ffbcd2932..bd08394c26cb 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -211,7 +211,7 @@ static void test_lpm_map(int keysize)
 
key->prefixlen = 8 * keysize;
memcpy(key->data, data, keysize);
-   r = bpf_map_lookup(map, key, value);
+   r = bpf_map_lookup_elem(map, key, value);
assert(!r || errno == ENOENT);
assert(!t == !!r);
 
@@ -300,32 +300,32 @@ static void test_lpm_ipaddr(void)
 
/* Test some lookups that should come back with a value */
inet_pton(AF_INET, "192.168.128.23", key_ipv4->data);
-   assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+   assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == 0);
assert(value == 3);
 
inet_pton(AF_INET, "192.168.0.1", key_ipv4->data);
-   assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+   assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == 0);
assert(value == 2);
 
inet_pton(AF_INET6, "2a00:1450:4001:814::", key_ipv6->data);
-   assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+   assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == 0);
assert(value == 0xdeadbeef);
 
inet_pton(AF_INET6, "2a00:1450:4001:814::1", key_ipv6->data);
-   assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+   assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == 0);
assert(value == 0xdeadbeef);
 
/* Test some lookups that should not match any entry */
inet_pton(AF_INET, "10.0.0.1", key_ipv4->data);
-   assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+   assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == -1 &&
   errno == ENOENT);
 
inet_pton(AF_INET, "11.11.11.11", key_ipv4->data);
-   assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+   assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == -1 &&
   errno == ENOENT);
 
inet_pton(AF_INET6, "2a00:ffff::", key_ipv6->data);
-   assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == -1 &&
+   assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == -1 &&
   errno == ENOENT);

[PATCH net-next v2 2/8] bpf: Use bpf_map_update_elem() from the library

2017-02-06 Thread Mickaël Salaün
Replace bpf_map_update() with bpf_map_update_elem() calls.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/lib/bpf/bpf.c|  5 +--
 tools/lib/bpf/bpf.h|  2 +-
 tools/testing/selftests/bpf/bpf_sys.h  | 13 --
 tools/testing/selftests/bpf/test_lpm_map.c | 15 +++
 tools/testing/selftests/bpf/test_lru_map.c | 65 +++---
 tools/testing/selftests/bpf/test_maps.c| 57 +-
 6 files changed, 73 insertions(+), 84 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index e96e2a9a7742..ea3369b50321 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -97,12 +97,11 @@ int bpf_load_program(enum bpf_prog_type type, const struct 
bpf_insn *insns,
return sys_bpf(BPF_PROG_LOAD, , sizeof(attr));
 }
 
-int bpf_map_update_elem(int fd, void *key, void *value,
+int bpf_map_update_elem(int fd, const void *key, const void *value,
__u64 flags)
 {
-   union bpf_attr attr;
+   union bpf_attr attr = {};
 
-   bzero(&attr, sizeof(attr));
attr.map_fd = fd;
attr.key = ptr_to_u64(key);
attr.value = ptr_to_u64(value);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index bc959a2de023..2458534c8b33 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -33,7 +33,7 @@ int bpf_load_program(enum bpf_prog_type type, const struct 
bpf_insn *insns,
 __u32 kern_version, char *log_buf,
 size_t log_buf_sz);
 
-int bpf_map_update_elem(int fd, void *key, void *value,
+int bpf_map_update_elem(int fd, const void *key, const void *value,
__u64 flags);
 
 int bpf_map_lookup_elem(int fd, void *key, void *value);
diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
index e7bbe3e5402e..e08dec0db9e0 100644
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ b/tools/testing/selftests/bpf/bpf_sys.h
@@ -35,19 +35,6 @@ static inline int bpf_map_lookup(int fd, const void *key, 
void *value)
return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
-static inline int bpf_map_update(int fd, const void *key, const void *value,
-uint64_t flags)
-{
-   union bpf_attr attr = {};
-
-   attr.map_fd = fd;
-   attr.key = bpf_ptr_to_u64(key);
-   attr.value = bpf_ptr_to_u64(value);
-   attr.flags = flags;
-
-   return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
-}
-
 static inline int bpf_map_delete(int fd, const void *key)
 {
union bpf_attr attr = {};
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c 
b/tools/testing/selftests/bpf/test_lpm_map.c
index 26775c00273f..e29ffbcd2932 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 
+#include 
 #include "bpf_sys.h"
 #include "bpf_util.h"
 
@@ -198,7 +199,7 @@ static void test_lpm_map(int keysize)
 
key->prefixlen = value[keysize];
memcpy(key->data, value, keysize);
-   r = bpf_map_update(map, key, value, 0);
+   r = bpf_map_update_elem(map, key, value, 0);
assert(!r);
}
 
@@ -266,32 +267,32 @@ static void test_lpm_ipaddr(void)
value = 1;
key_ipv4->prefixlen = 16;
inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
-   assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+   assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
 
value = 2;
key_ipv4->prefixlen = 24;
inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
-   assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+   assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
 
value = 3;
key_ipv4->prefixlen = 24;
inet_pton(AF_INET, "192.168.128.0", key_ipv4->data);
-   assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+   assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
 
value = 5;
key_ipv4->prefixlen = 24;
inet_pton(AF_INET, "192.168.1.0", key_ipv4->data);
-   assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+   assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
 
value = 4;
key_ipv4->prefixlen = 23;
inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
-   assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+   assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
 
value = 0xdeadbeef;
key_ipv6->prefixlen = 64;
inet_pton(AF_INET6, "2a00:1450:4001:814::200e", key_ipv6->data);
-   assert(bpf_map_update(map_fd_ipv6, key_ipv6, &value, 0) == 0);
+   assert(bpf_map_update_elem(map_fd_ipv6, key_ipv6, &value, 0) == 0);
 
/* Set tprefixlen to maximum for 

[PATCH net-next v2 4/8] bpf: Use bpf_map_delete_elem() from the library

2017-02-06 Thread Mickaël Salaün
Replace bpf_map_delete() with bpf_map_delete_elem() calls.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/lib/bpf/bpf.c|  5 ++---
 tools/lib/bpf/bpf.h|  2 +-
 tools/testing/selftests/bpf/bpf_sys.h  | 10 --
 tools/testing/selftests/bpf/test_lru_map.c |  6 +++---
 tools/testing/selftests/bpf/test_maps.c| 22 +++---
 5 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 81505801fa33..ee3a87de15ba 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -121,11 +121,10 @@ int bpf_map_lookup_elem(int fd, const void *key, void 
*value)
return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
-int bpf_map_delete_elem(int fd, void *key)
+int bpf_map_delete_elem(int fd, const void *key)
 {
-   union bpf_attr attr;
+   union bpf_attr attr = {};
 
-   bzero(&attr, sizeof(attr));
attr.map_fd = fd;
attr.key = ptr_to_u64(key);
 
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 171cf594f782..f559f648db45 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -37,7 +37,7 @@ int bpf_map_update_elem(int fd, const void *key, const void 
*value,
__u64 flags);
 
 int bpf_map_lookup_elem(int fd, const void *key, void *value);
-int bpf_map_delete_elem(int fd, void *key);
+int bpf_map_delete_elem(int fd, const void *key);
 int bpf_map_get_next_key(int fd, void *key, void *next_key);
 int bpf_obj_pin(int fd, const char *pathname);
 int bpf_obj_get(const char *pathname);
diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
index 0a5a6060db70..17581a42e1d9 100644
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ b/tools/testing/selftests/bpf/bpf_sys.h
@@ -24,16 +24,6 @@ static inline int bpf(int cmd, union bpf_attr *attr, 
unsigned int size)
 #endif
 }
 
-static inline int bpf_map_delete(int fd, const void *key)
-{
-   union bpf_attr attr = {};
-
-   attr.map_fd = fd;
-   attr.key = bpf_ptr_to_u64(key);
-
-   return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
-}
-
 static inline int bpf_map_next_key(int fd, const void *key, void *next_key)
 {
union bpf_attr attr = {};
diff --git a/tools/testing/selftests/bpf/test_lru_map.c 
b/tools/testing/selftests/bpf/test_lru_map.c
index 53155009bdb6..d375fac1a49c 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -318,7 +318,7 @@ static void test_lru_sanity2(int map_type, int map_flags, 
unsigned int tgt_free)
key = 1;
if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
-   assert(!bpf_map_delete(lru_map_fd, &key));
+   assert(!bpf_map_delete_elem(lru_map_fd, &key));
} else {
assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST));
}
@@ -470,8 +470,8 @@ static void test_lru_sanity4(int map_type, int map_flags, 
unsigned int tgt_free)
}
 
for (; key <= 2 * tgt_free; key++) {
-   assert(!bpf_map_delete(lru_map_fd, &key));
-   assert(bpf_map_delete(lru_map_fd, &key));
+   assert(!bpf_map_delete_elem(lru_map_fd, &key));
+   assert(bpf_map_delete_elem(lru_map_fd, &key));
}
 
end_key = key + 2 * tgt_free;
diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index c73a1fbc5bcc..ae22fdc93172 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -85,7 +85,7 @@ static void test_hashmap(int task, void *data)
 
/* Check that key = 0 doesn't exist. */
key = 0;
-   assert(bpf_map_delete(fd, &key) == -1 && errno == ENOENT);
+   assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
 
/* Iterate over two elements. */
assert(bpf_map_next_key(fd, &key, &next_key) == 0 &&
@@ -97,10 +97,10 @@ static void test_hashmap(int task, void *data)
 
/* Delete both elements. */
key = 1;
-   assert(bpf_map_delete(fd, &key) == 0);
+   assert(bpf_map_delete_elem(fd, &key) == 0);
key = 2;
-   assert(bpf_map_delete(fd, &key) == 0);
-   assert(bpf_map_delete(fd, &key) == -1 && errno == ENOENT);
+   assert(bpf_map_delete_elem(fd, &key) == 0);
+   assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
 
key = 0;
/* Check that map is empty. */
@@ -170,7 +170,7 @@ static void test_hashmap_percpu(int task, void *data)
   errno == E2BIG);
 
/* Check that key = 0 doesn't exist. */
-   assert(bpf_map_delete(fd, &key) == -1 && errno == ENOENT);
+   assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
 
/* Iterate over two elements. */
while (!bpf_map_next_key(fd, , 

[PATCH net-next v2 5/8] bpf: Use bpf_map_get_next_key() from the library

2017-02-06 Thread Mickaël Salaün
Replace bpf_map_next_key() with bpf_map_get_next_key() calls.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/lib/bpf/bpf.c|  5 ++---
 tools/lib/bpf/bpf.h|  2 +-
 tools/testing/selftests/bpf/bpf_sys.h  | 11 --
 tools/testing/selftests/bpf/test_lru_map.c |  2 +-
 tools/testing/selftests/bpf/test_maps.c| 34 +++---
 5 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index ee3a87de15ba..c7c3e2b34403 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -131,11 +131,10 @@ int bpf_map_delete_elem(int fd, const void *key)
return sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
 }
 
-int bpf_map_get_next_key(int fd, void *key, void *next_key)
+int bpf_map_get_next_key(int fd, const void *key, void *next_key)
 {
-   union bpf_attr attr;
+   union bpf_attr attr = {};
 
-   bzero(&attr, sizeof(attr));
attr.map_fd = fd;
attr.key = ptr_to_u64(key);
attr.next_key = ptr_to_u64(next_key);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index f559f648db45..88f07c15423a 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -38,7 +38,7 @@ int bpf_map_update_elem(int fd, const void *key, const void 
*value,
 
 int bpf_map_lookup_elem(int fd, const void *key, void *value);
 int bpf_map_delete_elem(int fd, const void *key);
-int bpf_map_get_next_key(int fd, void *key, void *next_key);
+int bpf_map_get_next_key(int fd, const void *key, void *next_key);
 int bpf_obj_pin(int fd, const char *pathname);
 int bpf_obj_get(const char *pathname);
 int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type);
diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
index 17581a42e1d9..aeff99f0a411 100644
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ b/tools/testing/selftests/bpf/bpf_sys.h
@@ -24,17 +24,6 @@ static inline int bpf(int cmd, union bpf_attr *attr, 
unsigned int size)
 #endif
 }
 
-static inline int bpf_map_next_key(int fd, const void *key, void *next_key)
-{
-   union bpf_attr attr = {};
-
-   attr.map_fd = fd;
-   attr.key = bpf_ptr_to_u64(key);
-   attr.next_key = bpf_ptr_to_u64(next_key);
-
-   return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
-}
-
 static inline int bpf_map_create(enum bpf_map_type type, uint32_t size_key,
 uint32_t size_value, uint32_t max_elem,
 uint32_t flags)
diff --git a/tools/testing/selftests/bpf/test_lru_map.c 
b/tools/testing/selftests/bpf/test_lru_map.c
index d375fac1a49c..94ecd4c58d75 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -46,7 +46,7 @@ static int map_subset(int map0, int map1)
unsigned long long value0[nr_cpus], value1[nr_cpus];
int ret;
 
-   while (!bpf_map_next_key(map1, _key, _key)) {
+   while (!bpf_map_get_next_key(map1, _key, _key)) {
assert(!bpf_map_lookup_elem(map1, _key, value1));
ret = bpf_map_lookup_elem(map0, _key, value0);
if (ret) {
diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index ae22fdc93172..c96f9c9661a0 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -88,11 +88,11 @@ static void test_hashmap(int task, void *data)
assert(bpf_map_delete_elem(fd, ) == -1 && errno == ENOENT);
 
/* Iterate over two elements. */
-   assert(bpf_map_next_key(fd, , _key) == 0 &&
+   assert(bpf_map_get_next_key(fd, , _key) == 0 &&
   (next_key == 1 || next_key == 2));
-   assert(bpf_map_next_key(fd, _key, _key) == 0 &&
+   assert(bpf_map_get_next_key(fd, _key, _key) == 0 &&
   (next_key == 1 || next_key == 2));
-   assert(bpf_map_next_key(fd, _key, _key) == -1 &&
+   assert(bpf_map_get_next_key(fd, _key, _key) == -1 &&
   errno == ENOENT);
 
/* Delete both elements. */
@@ -104,7 +104,7 @@ static void test_hashmap(int task, void *data)
 
key = 0;
/* Check that map is empty. */
-   assert(bpf_map_next_key(fd, , _key) == -1 &&
+   assert(bpf_map_get_next_key(fd, , _key) == -1 &&
   errno == ENOENT);
 
close(fd);
@@ -173,7 +173,7 @@ static void test_hashmap_percpu(int task, void *data)
assert(bpf_map_delete_elem(fd, ) == -1 && errno == ENOENT);
 
/* Iterate over two elements. */
-   while (!bpf_map_next_key(fd, , _key)) {
+   while (!bpf_map_get_next_key(fd, , _key)) {
assert((expected_key_mask & next_key) == next_key);
expected_key_mask &= ~next_key;
 
@@ -199,7 +199,7 @@ static void test_hashmap_percpu(int task, void 

[PATCH v2] netfilter: xt_hashlimit: Fix integer divide round to zero.

2017-02-06 Thread Alban Browaeys
Divide the divisor by the multiplier before applying it to the input.
When this would "divide by zero", divide the multiplier by the divisor
first, then multiply the input by this value.

Currently user2creds outputs zero when the input value is bigger than the
number of slices and lower than scale.
This happens because the user input undergoes an integer division by
a number greater than itself (scale).
That rounds down to zero; then we multiply zero by the credits slice size.
  iptables -t filter -I INPUT --protocol tcp --match hashlimit
  --hashlimit 40/second --hashlimit-burst 20 --hashlimit-mode srcip
  --hashlimit-name syn-flood --jump RETURN
thus trigger the overflow detection code:
xt_hashlimit: overflow, try lower: 25000/20
(25000 as hashlimit avd and 20 the burst)
Here:
134217 slices of (HZ * CREDITS_PER_JIFFY) size.
50 is user input value
100 is XT_HASHLIMIT_SCALE_v2
gives: 0 as user2creds output
Setting burst to "1" typically solves the issue ...
but setting it to "40" does too!

This is on 32bit arch calling into revision 2 of hashlimit.

Signed-off-by: Alban Browaeys 
---
v2:
- fix missing conditional statement in revision 1 case
.

I removed the code duplication between revision 1 and 2.

v1: https://lkml.org/lkml/2017/2/4/173
---
 net/netfilter/xt_hashlimit.c | 25 +
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10063408141d..84ad5ab34558 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -463,23 +463,16 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 /* Precision saver. */
 static u64 user2credits(u64 user, int revision)
 {
-   if (revision == 1) {
-   /* If multiplying would overflow... */
-   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
-   /* Divide first. */
-   return div64_u64(user, XT_HASHLIMIT_SCALE)
-   * HZ * CREDITS_PER_JIFFY_v1;
-
-   return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
-XT_HASHLIMIT_SCALE);
-   } else {
-   if (user > 0xULL / (HZ*CREDITS_PER_JIFFY))
-   return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
-   * HZ * CREDITS_PER_JIFFY;
+   u64 scale = (revision == 1) ?
+   XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2;
+   u64 cpj = (revision == 1) ?
+   CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
 
-   return div64_u64(user * HZ * CREDITS_PER_JIFFY,
-XT_HASHLIMIT_SCALE_v2);
-   }
+   /* Avoid overflow: divide the constant operands first */
+   if (scale >= HZ * cpj)
+   return div64_u64(user, div64_u64(scale, HZ * cpj));
+
+   return user * div64_u64(HZ * cpj, scale);
 }
 
 static u32 user2credits_byte(u32 user)
-- 
2.11.0



[PATCH net-next v2 8/8] bpf: Add test_tag to .gitignore

2017-02-06 Thread Mickaël Salaün
Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore 
b/tools/testing/selftests/bpf/.gitignore
index d3b1c9bca407..541d9d7fad5a 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -2,3 +2,4 @@ test_verifier
 test_maps
 test_lru_map
 test_lpm_map
+test_tag
-- 
2.11.0



Re: net/icmp: null-ptr-deref in ping_v4_push_pending_frames

2017-02-06 Thread Cong Wang
On Mon, Feb 6, 2017 at 11:39 AM, Andrey Konovalov  wrote:
> Hi,
>
> I've got the following error report while running the syzkaller fuzzer.
>
> The null-ptr-deref is caused by sendto() on a socket(PF_INET,
> SOCK_DGRAM, PROT_ICMP).
> Note, that this requires the ability to create such sockets, which can
> be configured by net.ipv4.ping_group_range
> (https://lwn.net/Articles/422330/).
>
> A reproducer and .config are attached.
>
> On commit a572a1b999489efb591287632279c6c9eca3e4ed.
>
> general protection fault:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 2 PID: 3880 Comm: syz-executor1 Not tainted 4.10.0-rc6+ #124
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: 880060048040 task.stack: 880069be8000
> RIP: 0010:ping_v4_push_pending_frames net/ipv4/ping.c:647 [inline]
> RIP: 0010:ping_v4_sendmsg+0x1acd/0x23f0 net/ipv4/ping.c:837
> RSP: 0018:880069bef8b8 EFLAGS: 00010206
> RAX: dc00 RBX: 880069befb90 RCX: 
> RDX: 0018 RSI: 880069befa30 RDI: 00c2
> RBP: 880069befbb8 R08: 0008 R09: 
> R10: 0002 R11:  R12: 880069befab0
> R13: 88006c624a80 R14: 880069befa70 R15: 
> FS:  7f6f7c716700() GS:88006de0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 004a6f28 CR3: 3a134000 CR4: 06e0
> Call Trace:
>  inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
>  sock_sendmsg_nosec net/socket.c:635 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:645
>  SYSC_sendto+0x660/0x810 net/socket.c:1687
>  SyS_sendto+0x40/0x50 net/socket.c:1655
>  entry_SYSCALL_64_fastpath+0x1f/0xc2
> RIP: 0033:0x445879
> RSP: 002b:7f6f7c715b58 EFLAGS: 0282 ORIG_RAX: 002c
> RAX: ffda RBX: 0005 RCX: 00445879
> RDX: 0008 RSI: 20001000 RDI: 0005
> RBP: 006e1ca0 R08: 20ed9ff0 R09: 0010
> R10: 2010 R11: 0282 R12: 00708000
> R13:  R14: 7f6f7c7169c0 R15: 7f6f7c716700
> Code: 38 ca 7c 08 84 c9 0f 85 35 03 00 00 49 8d bf c2 00 00 00 66 89
> 83 a2 fe ff ff 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f>
> b6 14 02 48 89 f8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85
> RIP: ping_v4_push_pending_frames net/ipv4/ping.c:647 [inline] RSP:
> 880069bef8b8
> RIP: ping_v4_sendmsg+0x1acd/0x23f0 net/ipv4/ping.c:837 RSP: 880069bef8b8
> ---[ end trace 13dad24b243d08a7 ]---

This fixes it for me:

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 86cca61..68d77b1 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -642,6 +642,8 @@ static int ping_v4_push_pending_frames(struct sock
*sk, struct pingfakehdr *pfh,
 {
struct sk_buff *skb = skb_peek(>sk_write_queue);

+   if (!skb)
+   return 0;
pfh->wcheck = csum_partial((char *)>icmph,
sizeof(struct icmphdr), pfh->wcheck);
pfh->icmph.checksum = csum_fold(pfh->wcheck);


[PATCH net-next v2 7/8] bpf: Remove bpf_sys.h from selftests

2017-02-06 Thread Mickaël Salaün
Add the required dependency headers.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/lib/bpf/bpf.c |  6 ++
 tools/testing/selftests/bpf/bpf_sys.h   | 27 ---
 tools/testing/selftests/bpf/test_lpm_map.c  |  1 -
 tools/testing/selftests/bpf/test_lru_map.c  |  1 -
 tools/testing/selftests/bpf/test_maps.c |  1 -
 tools/testing/selftests/bpf/test_tag.c  |  3 +--
 tools/testing/selftests/bpf/test_verifier.c |  3 +--
 7 files changed, 8 insertions(+), 34 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/bpf_sys.h

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index c7c3e2b34403..06a7d4ed1e11 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -50,7 +50,13 @@ static __u64 ptr_to_u64(const void *ptr)
 static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
   unsigned int size)
 {
+#ifdef __NR_bpf
return syscall(__NR_bpf, cmd, attr, size);
+#else
+   fprintf(stderr, "No bpf syscall, kernel headers too old?\n");
+   errno = ENOSYS;
+   return -1;
+#endif
 }
 
 int bpf_create_map(enum bpf_map_type map_type, int key_size,
diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
deleted file mode 100644
index aa076a8a07f7..
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __BPF_SYS__
-#define __BPF_SYS__
-
-#include 
-#include 
-
-#include 
-
-#include 
-
-static inline __u64 bpf_ptr_to_u64(const void *ptr)
-{
-   return (__u64)(unsigned long) ptr;
-}
-
-static inline int bpf(int cmd, union bpf_attr *attr, unsigned int size)
-{
-#ifdef __NR_bpf
-   return syscall(__NR_bpf, cmd, attr, size);
-#else
-   fprintf(stderr, "No bpf syscall, kernel headers too old?\n");
-   errno = ENOSYS;
-   return -1;
-#endif
-}
-
-#endif /* __BPF_SYS__ */
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c 
b/tools/testing/selftests/bpf/test_lpm_map.c
index 3cc812cac2d7..e97565243d59 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -23,7 +23,6 @@
 #include 
 
 #include 
-#include "bpf_sys.h"
 #include "bpf_util.h"
 
 struct tlpm_node {
diff --git a/tools/testing/selftests/bpf/test_lru_map.c 
b/tools/testing/selftests/bpf/test_lru_map.c
index 20fa3df20a01..5d9790ee14fb 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -19,7 +19,6 @@
 #include 
 
 #include 
-#include "bpf_sys.h"
 #include "bpf_util.h"
 
 #define LOCAL_FREE_TARGET  (128)
diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index 050a5205c78d..a0090016372e 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -22,7 +22,6 @@
 #include 
 
 #include 
-#include "bpf_sys.h"
 #include "bpf_util.h"
 
 static int map_flags;
diff --git a/tools/testing/selftests/bpf/test_tag.c 
b/tools/testing/selftests/bpf/test_tag.c
index a76d695ab56e..4c7a1cc73f6c 100644
--- a/tools/testing/selftests/bpf/test_tag.c
+++ b/tools/testing/selftests/bpf/test_tag.c
@@ -1,5 +1,6 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -20,8 +21,6 @@
 
 #include "../../../include/linux/filter.h"
 
-#include "bpf_sys.h"
-
 static struct bpf_insn prog[BPF_MAXINSNS];
 
 static void bpf_gen_imm_prog(unsigned int insns, int fd_map)
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 1e3cae650474..24d64e5fec50 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -9,6 +9,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,8 +29,6 @@
 
 #include "../../../include/linux/filter.h"
 
-#include "bpf_sys.h"
-
 #ifndef ARRAY_SIZE
 # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
-- 
2.11.0



[PATCH net-next v2 6/8] bpf: Use bpf_create_map() from the library

2017-02-06 Thread Mickaël Salaün
Replace bpf_map_create() with bpf_create_map() calls.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
 tools/testing/selftests/bpf/bpf_sys.h   | 15 ---
 tools/testing/selftests/bpf/test_lpm_map.c  |  6 +++---
 tools/testing/selftests/bpf/test_lru_map.c  |  4 ++--
 tools/testing/selftests/bpf/test_maps.c | 14 +++---
 tools/testing/selftests/bpf/test_tag.c  |  2 +-
 tools/testing/selftests/bpf/test_verifier.c |  4 ++--
 6 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/tools/testing/selftests/bpf/bpf_sys.h 
b/tools/testing/selftests/bpf/bpf_sys.h
index aeff99f0a411..aa076a8a07f7 100644
--- a/tools/testing/selftests/bpf/bpf_sys.h
+++ b/tools/testing/selftests/bpf/bpf_sys.h
@@ -24,19 +24,4 @@ static inline int bpf(int cmd, union bpf_attr *attr, 
unsigned int size)
 #endif
 }
 
-static inline int bpf_map_create(enum bpf_map_type type, uint32_t size_key,
-uint32_t size_value, uint32_t max_elem,
-uint32_t flags)
-{
-   union bpf_attr attr = {};
-
-   attr.map_type = type;
-   attr.key_size = size_key;
-   attr.value_size = size_value;
-   attr.max_entries = max_elem;
-   attr.map_flags = flags;
-
-   return bpf(BPF_MAP_CREATE, , sizeof(attr));
-}
-
 #endif /* __BPF_SYS__ */
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c 
b/tools/testing/selftests/bpf/test_lpm_map.c
index bd08394c26cb..3cc812cac2d7 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -183,7 +183,7 @@ static void test_lpm_map(int keysize)
key = alloca(sizeof(*key) + keysize);
memset(key, 0, sizeof(*key) + keysize);
 
-   map = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+   map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
 sizeof(*key) + keysize,
 keysize + 1,
 4096,
@@ -253,12 +253,12 @@ static void test_lpm_ipaddr(void)
key_ipv4 = alloca(key_size_ipv4);
key_ipv6 = alloca(key_size_ipv6);
 
-   map_fd_ipv4 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+   map_fd_ipv4 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
 key_size_ipv4, sizeof(value),
 100, BPF_F_NO_PREALLOC);
assert(map_fd_ipv4 >= 0);
 
-   map_fd_ipv6 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+   map_fd_ipv6 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
 key_size_ipv6, sizeof(value),
 100, BPF_F_NO_PREALLOC);
assert(map_fd_ipv6 >= 0);
diff --git a/tools/testing/selftests/bpf/test_lru_map.c 
b/tools/testing/selftests/bpf/test_lru_map.c
index 94ecd4c58d75..20fa3df20a01 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -31,11 +31,11 @@ static int create_map(int map_type, int map_flags, unsigned 
int size)
 {
int map_fd;
 
-   map_fd = bpf_map_create(map_type, sizeof(unsigned long long),
+   map_fd = bpf_create_map(map_type, sizeof(unsigned long long),
sizeof(unsigned long long), size, map_flags);
 
if (map_fd == -1)
-   perror("bpf_map_create");
+   perror("bpf_create_map");
 
return map_fd;
 }
diff --git a/tools/testing/selftests/bpf/test_maps.c 
b/tools/testing/selftests/bpf/test_maps.c
index c96f9c9661a0..050a5205c78d 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -32,7 +32,7 @@ static void test_hashmap(int task, void *data)
long long key, next_key, value;
int fd;
 
-   fd = bpf_map_create(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+   fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
2, map_flags);
if (fd < 0) {
printf("Failed to create hashmap '%s'!\n", strerror(errno));
@@ -118,7 +118,7 @@ static void test_hashmap_percpu(int task, void *data)
int expected_key_mask = 0;
int fd, i;
 
-   fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
+   fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
sizeof(value[0]), 2, map_flags);
if (fd < 0) {
printf("Failed to create hashmap '%s'!\n", strerror(errno));
@@ -210,7 +210,7 @@ static void test_arraymap(int task, void *data)
int key, next_key, fd;
long long value;
 
-   fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
+   fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
2, 0);
if (fd < 0) {
printf("Failed to create arraymap '%s'!\n", strerror(errno));
@@ -264,7 +264,7 @@ 

Re: [PATCH net-next v1 6/7] bpf: Use the bpf_load_program() from the library

2017-02-06 Thread Mickaël Salaün


On 06/02/2017 23:44, Daniel Borkmann wrote:
> On 02/06/2017 10:30 PM, Mickaël Salaün wrote:
>> On 06/02/2017 20:18, Daniel Borkmann wrote:
>>> On 02/06/2017 08:16 PM, Mickaël Salaün wrote:
 On 06/02/2017 16:30, Daniel Borkmann wrote:
> On 02/06/2017 12:14 AM, Mickaël Salaün wrote:
>> Replace bpf_prog_load() with bpf_load_program() calls.
>>
>> Use the tools include directory instead of the installed one to allow
>> builds from other kernels.
>>
>> Signed-off-by: Mickaël Salaün 
>> Cc: Alexei Starovoitov 
>> Cc: Daniel Borkmann 
>> Cc: Shuah Khan 
>> ---
>> tools/testing/selftests/bpf/Makefile|  6 +-
>> tools/testing/selftests/bpf/bpf_sys.h   | 21
>> -
>> tools/testing/selftests/bpf/test_tag.c  |  6 --
>> tools/testing/selftests/bpf/test_verifier.c |  8 +---
>> 4 files changed, 14 insertions(+), 27 deletions(-)
>
> No objections, but if so, can't we add the remaining missing
> pieces to bpf lib, so we can remove bpf_sys.h altogether?

 OK, I'll send a new patch replacing bpf_sys.h entirely.
>>>
>>> Sounds great, thanks!
>>
>> Do you prefer a big patch or one for each replaced function?
> 
> I think it makes sense to split it into two: i) this patch as-is
> for the prog part, and ii) rest for maps.
> 

Hum, I already split them to ease the review. I'm going to send this
series now.



signature.asc
Description: OpenPGP digital signature


Re: [PATCH net-next v2 3/3] bpf: Always test unprivileged programs

2017-02-06 Thread Daniel Borkmann

On 02/06/2017 09:52 PM, Mickaël Salaün wrote:

If selftests are run as root, then execute the unprivileged checks as
well. This switches the test count from 240 to 364.

The test numbers are suffixed with "/u" when executed as unprivileged or
with "/p" when executed as privileged.

The geteuid() check is replaced with a capability check.

Handling capabilities requires the libcap dependency.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 


Acked-by: Daniel Borkmann 


Re: [PATCH net-next v1 6/7] bpf: Use the bpf_load_program() from the library

2017-02-06 Thread Daniel Borkmann

On 02/06/2017 10:30 PM, Mickaël Salaün wrote:

On 06/02/2017 20:18, Daniel Borkmann wrote:

On 02/06/2017 08:16 PM, Mickaël Salaün wrote:

On 06/02/2017 16:30, Daniel Borkmann wrote:

On 02/06/2017 12:14 AM, Mickaël Salaün wrote:

Replace bpf_prog_load() with bpf_load_program() calls.

Use the tools include directory instead of the installed one to allow
builds from other kernels.

Signed-off-by: Mickaël Salaün 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Shuah Khan 
---
tools/testing/selftests/bpf/Makefile|  6 +-
tools/testing/selftests/bpf/bpf_sys.h   | 21
-
tools/testing/selftests/bpf/test_tag.c  |  6 --
tools/testing/selftests/bpf/test_verifier.c |  8 +---
4 files changed, 14 insertions(+), 27 deletions(-)


No objections, but if so, can't we add the remaining missing
pieces to bpf lib, so we can remove bpf_sys.h altogether?


OK, I'll send a new patch replacing bpf_sys.h entirely.


Sounds great, thanks!


Do you prefer a big patch or one for each replaced function?


I think it makes sense to split it into two: i) this patch as-is
for the prog part, and ii) rest for maps.


Re: [PATCH iproute2/net-next 0/7] tc: flower: Masked ICMP match and ND match

2017-02-06 Thread Stephen Hemminger
On Thu,  2 Feb 2017 11:38:33 +0100
Simon Horman  wrote:

> Hi,
> 
> this series have several related parts.
> 
> * tc: flower: Update documentation to indicate ARP takes IPv4 prefixes
> 
>   Enhance documentation for consistency with later documentation changes.
> 
> * tc: flower: use correct type when calling flower_icmp_attr_type
> 
>   Type correction to ICMP code; should not have runtime effect
> 
> * tc: flower: provide generic masked u8 parser helper
>   tc: flower: provide generic masked u8 print helper
> 
>   Generic parsing and printing of masked u8 options
> 
> * tc: flower: support masked ICMP code and type match
> 
>   Support masking ICMP code and type matches.
>   Unmasked matching is already supported by iproute2
>   Masked matching is already supported by the kernel.
> 
>   This is used by the ND patches
> 
> * tc: flower: Add TCA_FLOWER_KEY_ND_*
>   tc: flower: Support matching on ND
> 
> The last two patches are marked as RFC as they support functionality
> submitted to but not yet not yet present in the kernel.
> 
> 
> Simon Horman (7):
>   tc: flower: Update documentation to indicate ARP takes IPv4 prefixes
>   tc: flower: use correct type when calling flower_icmp_attr_type
>   tc: flower: provide generic masked u8 parser helper
>   tc: flower: provide generic masked u8 print helper
>   tc: flower: support masked ICMP code and type match
>   tc: flower: Add TCA_FLOWER_KEY_ND_*
>   tc: flower: Support matching on ND
> 
>  include/linux/pkt_cls.h |   7 ++
>  man/man8/tc-flower.8|  58 +--
>  tc/f_flower.c   | 260 
> +---
>  3 files changed, 258 insertions(+), 67 deletions(-)
> 

Since this patchset depended on changes to pkt_cls.h which are not accepted 
upstream
into net-next, I marked it as awaiting upstream.  When the corresponding kernel 
changes
are accepted into net-next please resubmit it.


Re: [PATCH iproute2/net-next 0/3] Add the tc-sample action

2017-02-06 Thread Stephen Hemminger
On Sun,  5 Feb 2017 09:58:51 +0200
Yotam Gigi  wrote:

> This patchset adds the tc-sample action support and the corresponding man
> page. More information about the action and its usage can be found in the
> commit message.
> 
> Yotam Gigi (3):
>   tc: Add support for the sample tc action
>   tc: man: Add man entry for the tc-sample action
>   tc: man: matchall: Update examples to include sample
> 
>  bash-completion/tc   |   8 +-
>  include/linux/tc_act/tc_sample.h |  26 ++
>  man/man8/Makefile|   2 +-
>  man/man8/tc-matchall.8   |  10 +++
>  man/man8/tc-sample.8 | 125 ++
>  tc/Makefile  |   1 +
>  tc/m_sample.c| 186 
> +++
>  7 files changed, 356 insertions(+), 2 deletions(-)
>  create mode 100644 include/linux/tc_act/tc_sample.h
>  create mode 100644 man/man8/tc-sample.8
>  create mode 100644 tc/m_sample.c
> 

Applied to net-next. I had to fix merge conflict on tc/Makefile


Re: [PATCH net-next/iproute 2/5] tc: bash-completion: Prepare action autocomplete to support several actions

2017-02-06 Thread Stephen Hemminger
On Mon,  6 Feb 2017 15:19:21 +0200
Yotam Gigi  wrote:

> The action autocomplete routine (_tc_action_options) currently does not
> support several actions statements in one tc command line as it uses the
> _tc_once_attr and _tc_one_from_list.
> 
> For example, in that case:
> 
> $ tc filter add dev eth0 handle : u32 [...]  \
>  action sample group 5 rate 12 \
>  action sample 
> 
> the _tc_once_attr function, when invoked with "group rate" will not
> suggest those as they already exist on the command line.
> 
> Fix the function to use the _from variant, thus allowing each action
> autocomplete start from the action keyword, and not from the beginning of
> the command line.
> 
> Signed-off-by: Yotam Gigi 

This patch does not apply cleanly to current iproute2 net-next tree.
Please rebase and resubmit the whole series again.


$ cat bash-completion/tc.rej 
--- bash-completion/tc
+++ bash-completion/tc
@@ -454,26 +454,28 @@ _tc_filter_options()
 # Returns 0 is completion should stop after running this function, 1 otherwise.
 _tc_action_options()
 {
-case $1 in
+local from=$1
+local action=${words[from]}
+case $action in
 bpf)
 _tc_bpf_options
 return 0
 ;;
 mirred)
-_tc_one_of_list 'ingress egress'
-_tc_one_of_list 'mirror redirect'
-_tc_once_attr 'index dev'
+_tc_one_of_list_from $from 'ingress egress'
+_tc_one_of_list_from $from 'mirror redirect'
+_tc_once_attr_from $from 'index dev'
 return 0
 ;;
 sample)
-_tc_once_attr 'rate'
-_tc_once_attr 'trunc'
-_tc_once_attr 'group'
+_tc_once_attr_from $from 'rate'
+_tc_once_attr_from $from 'trunc'
+_tc_once_attr_from $from 'group'
 return 0
 ;;
 gact)
-_tc_one_of_list 'reclassify drop continue pass'
-_tc_once_attr 'random'
+_tc_one_of_list_from $from 'reclassify drop continue pass'
+_tc_once_attr_from $from 'random'
 return 0
 ;;
 esac


Re: [PATCH iproute2] ip route: Make name of protocol 0 consistent

2017-02-06 Thread Stephen Hemminger
On Thu,  2 Feb 2017 09:22:06 -0800
David Ahern  wrote:

> iproute2 can inconsistently show the name of protocol 0 if a route with
> a custom protocol is added. For example:
>   dsa@cartman:~$ ip -6 ro ls table all | egrep 'proto none|proto unspec'
>   local ::1 dev lo  table local  proto none  metric 0  pref medium
>   local fe80::225:90ff:fecb:1c18 dev lo  table local  proto none  metric 0  
> pref medium
>   local fe80::92e2:baff:fe5c:da5d dev lo  table local  proto none  metric 0  
> pref medium
> 
> protocol 0 is pretty printed as "none". Add a route with a custom protocol:
>   dsa@cartman:~$ sudo ip -6 ro add  2001:db8:200::1/128 dev eth0 proto 123
> 
> And now display has switched from "none" to "unspec":
>   dsa@cartman:~$ ip -6 ro ls table all | egrep 'proto none|proto unspec'
>   local ::1 dev lo  table local  proto unspec  metric 0  pref medium
>   local fe80::225:90ff:fecb:1c18 dev lo  table local  proto unspec  metric 0  
> pref medium
>   local fe80::92e2:baff:fe5c:da5d dev lo  table local  proto unspec  metric 0 
>  pref medium
> 
> The rt_protos file has the id to name mapping as "unspec" while
> rtnl_rtprot_tab[0] has "none". The presence of a custom protocol id
> triggers reading the rt_protos file and overwriting the string in
> rtnl_rtprot_tab. All of this is logic from 2004 and earlier.
> 
> The simplest change to achieve consistency is to update the rt_protos
> file to use "none" instead of "unspec".
> 
> Signed-off-by: David Ahern 
> ---
>  etc/iproute2/rt_protos | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/etc/iproute2/rt_protos b/etc/iproute2/rt_protos
> index 82cf9c46cf6f..21af85b9d7e1 100644
> --- a/etc/iproute2/rt_protos
> +++ b/etc/iproute2/rt_protos
> @@ -1,7 +1,7 @@
>  #
>  # Reserved protocols.
>  #
> -0unspec
> +0none
>  1redirect
>  2kernel
>  3boot

This doesn't look like a good solution, you loose the value of unspec.

Just to clarify. You added a custom protocol value to netlink.
And then you are using upstream iproute2 source to display the value.

The correct behavior in that case would be for upstream ip route show command 
to display
a numeric value (rather than a symbolic name).

But if you are shipping your own version of iproute then add an additional entry
to rt_protos with your new name, and for sanity update the local copy of 
rtnetlink.h

Of course, submitting your custom protocol upstream is the best long term 
solution.



[PATCH net-next v3 02/12] bnxt_en: Don't use DEFINE_DMA_UNMAP_ADDR to store DMA address in RX path.

2017-02-06 Thread Michael Chan
To support XDP_TX, we need the RX buffer's DMA address to transmit the
packet.  Convert the DMA address field to a permanent field in
bnxt_sw_rx_bd.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 21 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 +-
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 94f3d78..43eb52ce 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -608,7 +608,7 @@ static inline int bnxt_alloc_rx_data(struct bnxt *bp,
 
rx_buf->data = data;
rx_buf->data_ptr = data + BNXT_RX_OFFSET;
-   dma_unmap_addr_set(rx_buf, mapping, mapping);
+   rx_buf->mapping = mapping;
 
rxbd->rx_bd_haddr = cpu_to_le64(mapping);
 
@@ -628,8 +628,7 @@ static void bnxt_reuse_rx_data(struct bnxt_rx_ring_info 
*rxr, u16 cons,
prod_rx_buf->data = data;
prod_rx_buf->data_ptr = cons_rx_buf->data_ptr;
 
-   dma_unmap_addr_set(prod_rx_buf, mapping,
-  dma_unmap_addr(cons_rx_buf, mapping));
+   prod_rx_buf->mapping = cons_rx_buf->mapping;
 
prod_bd = >rx_desc_ring[RX_RING(prod)][RX_IDX(prod)];
cons_bd = >rx_desc_ring[RX_RING(cons)][RX_IDX(cons)];
@@ -816,7 +815,7 @@ static struct sk_buff *bnxt_rx_pages(struct bnxt *bp, 
struct bnxt_napi *bnapi,
 * a sw_prod index that equals the cons index, so we
 * need to clear the cons entry now.
 */
-   mapping = dma_unmap_addr(cons_rx_buf, mapping);
+   mapping = cons_rx_buf->mapping;
page = cons_rx_buf->page;
cons_rx_buf->page = NULL;
 
@@ -959,7 +958,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct 
bnxt_rx_ring_info *rxr,
prod_rx_buf->data_ptr = tpa_info->data_ptr;
 
mapping = tpa_info->mapping;
-   dma_unmap_addr_set(prod_rx_buf, mapping, mapping);
+   prod_rx_buf->mapping = mapping;
 
prod_bd = >rx_desc_ring[RX_RING(prod)][RX_IDX(prod)];
 
@@ -968,7 +967,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct 
bnxt_rx_ring_info *rxr,
tpa_info->data = cons_rx_buf->data;
tpa_info->data_ptr = cons_rx_buf->data_ptr;
cons_rx_buf->data = NULL;
-   tpa_info->mapping = dma_unmap_addr(cons_rx_buf, mapping);
+   tpa_info->mapping = cons_rx_buf->mapping;
 
tpa_info->len =
le32_to_cpu(tpa_start->rx_tpa_start_cmp_len_flags_type) >>
@@ -1405,7 +1404,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
}
 
len = le32_to_cpu(rxcmp->rx_cmp_len_flags_type) >> RX_CMP_LEN_SHIFT;
-   dma_addr = dma_unmap_addr(rx_buf, mapping);
+   dma_addr = rx_buf->mapping;
 
if (len <= bp->rx_copy_thresh) {
skb = bnxt_copy_skb(bnapi, data_ptr, len, dma_addr);
@@ -1881,7 +1880,7 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
 
dma_unmap_single(
>dev,
-   dma_unmap_addr(tpa_info, mapping),
+   tpa_info->mapping,
bp->rx_buf_use_size,
PCI_DMA_FROMDEVICE);
 
@@ -1898,8 +1897,7 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
if (!data)
continue;
 
-   dma_unmap_single(>dev,
-dma_unmap_addr(rx_buf, mapping),
+   dma_unmap_single(>dev, rx_buf->mapping,
 bp->rx_buf_use_size,
 PCI_DMA_FROMDEVICE);
 
@@ -1916,8 +1914,7 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
if (!page)
continue;
 
-   dma_unmap_page(>dev,
-  dma_unmap_addr(rx_agg_buf, mapping),
+   dma_unmap_page(>dev, rx_agg_buf->mapping,
   BNXT_RX_PAGE_SIZE, PCI_DMA_FROMDEVICE);
 
rx_agg_buf->page = NULL;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 11b5a4a..b4a04a3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -518,7 +518,7 @@ struct bnxt_sw_tx_bd {
 struct bnxt_sw_rx_bd {
void*data;
u8  *data_ptr;
-   DEFINE_DMA_UNMAP_ADDR(mapping);
+   dma_addr_t  mapping;
 };
 
 struct bnxt_sw_rx_agg_bd {
-- 
1.8.3.1



[PATCH net-next v3 04/12] bnxt_en: Parameterize RX buffer offsets.

2017-02-06 Thread Michael Chan
Convert the global constants BNXT_RX_OFFSET and BNXT_RX_DMA_OFFSET to
device parameters.  This will make it easier to support XDP with
headroom support which requires different RX buffer offsets.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 15 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 ++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index ee2cada..c0f2167 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -583,7 +583,7 @@ static inline u8 *__bnxt_alloc_rx_data(struct bnxt *bp, 
dma_addr_t *mapping,
if (!data)
return NULL;
 
-   *mapping = dma_map_single(>dev, data + BNXT_RX_DMA_OFFSET,
+   *mapping = dma_map_single(>dev, data + bp->rx_dma_offset,
  bp->rx_buf_use_size, bp->rx_dir);
 
if (dma_mapping_error(>dev, *mapping)) {
@@ -607,7 +607,7 @@ static inline int bnxt_alloc_rx_data(struct bnxt *bp,
return -ENOMEM;
 
rx_buf->data = data;
-   rx_buf->data_ptr = data + BNXT_RX_OFFSET;
+   rx_buf->data_ptr = data + bp->rx_offset;
rx_buf->mapping = mapping;
 
rxbd->rx_bd_haddr = cpu_to_le64(mapping);
@@ -778,7 +778,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
return NULL;
}
 
-   skb_reserve(skb, BNXT_RX_OFFSET);
+   skb_reserve(skb, bp->rx_offset);
skb_put(skb, offset_and_len & 0x);
return skb;
 }
@@ -1255,7 +1255,7 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt 
*bp,
}
 
tpa_info->data = new_data;
-   tpa_info->data_ptr = new_data + BNXT_RX_OFFSET;
+   tpa_info->data_ptr = new_data + bp->rx_offset;
tpa_info->mapping = new_mapping;
 
skb = build_skb(data, 0);
@@ -1267,7 +1267,7 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt 
*bp,
bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs);
return NULL;
}
-   skb_reserve(skb, BNXT_RX_OFFSET);
+   skb_reserve(skb, bp->rx_offset);
skb_put(skb, len);
}
 
@@ -2332,7 +2332,7 @@ static int bnxt_init_one_rx_ring(struct bnxt *bp, int 
ring_nr)
return -ENOMEM;
 
rxr->rx_tpa[i].data = data;
-   rxr->rx_tpa[i].data_ptr = data + BNXT_RX_OFFSET;
+   rxr->rx_tpa[i].data_ptr = data + bp->rx_offset;
rxr->rx_tpa[i].mapping = mapping;
}
} else {
@@ -2348,6 +2348,9 @@ static int bnxt_init_rx_rings(struct bnxt *bp)
 {
int i, rc = 0;
 
+   bp->rx_offset = BNXT_RX_OFFSET;
+   bp->rx_dma_offset = BNXT_RX_DMA_OFFSET;
+
for (i = 0; i < bp->rx_nr_rings; i++) {
rc = bnxt_init_one_rx_ring(bp, i);
if (rc)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 353b6d1..d8fc871 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -997,6 +997,8 @@ struct bnxt {
 
u32 rx_buf_size;
u32 rx_buf_use_size;/* useable size */
+   u16 rx_offset;
+   u16 rx_dma_offset;
enum dma_data_direction rx_dir;
u32 rx_ring_size;
u32 rx_agg_ring_size;
-- 
1.8.3.1



[PATCH net-next v3 12/12] bnxt_en: Add support for XDP_TX action.

2017-02-06 Thread Michael Chan
Add dedicated transmit function and transmit completion handler for
XDP.  The XDP transmit logic and completion logic are different than
regular TX ring.  The TX buffer is recycled back to the RX ring when
it completes.

v3: Improved the buffer recyling scheme for XDP_TX.

v2: Add trace_xdp_exception().
Add dma_sync.

Signed-off-by: Michael Chan 
Tested-by: Andy Gospodarek 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 36 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 19 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 83 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h |  1 +
 4 files changed, 122 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 665fe4f..cda1c78 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -212,16 +212,7 @@ static bool bnxt_vf_pciid(enum board_idx idx)
 #define BNXT_CP_DB_IRQ_DIS(db) \
writel(DB_CP_IRQ_DIS_FLAGS, db)
 
-static inline u32 bnxt_tx_avail(struct bnxt *bp, struct bnxt_tx_ring_info *txr)
-{
-   /* Tell compiler to fetch tx indices from memory. */
-   barrier();
-
-   return bp->tx_ring_size -
-   ((txr->tx_prod - txr->tx_cons) & bp->tx_ring_mask);
-}
-
-static const u16 bnxt_lhint_arr[] = {
+const u16 bnxt_lhint_arr[] = {
TX_BD_FLAGS_LHINT_512_AND_SMALLER,
TX_BD_FLAGS_LHINT_512_TO_1023,
TX_BD_FLAGS_LHINT_1024_TO_2047,
@@ -613,9 +604,8 @@ static inline u8 *__bnxt_alloc_rx_data(struct bnxt *bp, 
dma_addr_t *mapping,
return data;
 }
 
-static inline int bnxt_alloc_rx_data(struct bnxt *bp,
-struct bnxt_rx_ring_info *rxr,
-u16 prod, gfp_t gfp)
+int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
+  u16 prod, gfp_t gfp)
 {
struct rx_bd *rxbd = &rxr->rx_desc_ring[RX_RING(prod)][RX_IDX(prod)];
struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[prod];
@@ -1766,6 +1756,18 @@ static int bnxt_poll_work(struct bnxt *bp, struct 
bnxt_napi *bnapi, int budget)
break;
}
 
+   if (event & BNXT_TX_EVENT) {
+   struct bnxt_tx_ring_info *txr = bnapi->tx_ring;
+   void __iomem *db = txr->tx_doorbell;
+   u16 prod = txr->tx_prod;
+
+   /* Sync BD data before updating doorbell */
+   wmb();
+
+   writel(DB_KEY_TX | prod, db);
+   writel(DB_KEY_TX | prod, db);
+   }
+
cpr->cp_raw_cons = raw_cons;
/* ACK completion ring before freeing tx ring and producing new
 * buffers in rx/agg rings to prevent overflowing the completion
@@ -3066,12 +3068,14 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool 
irq_re_init)
bp->tx_ring[i].bnapi = bp->bnapi[j];
bp->bnapi[j]->tx_ring = &bp->tx_ring[i];
bp->tx_ring_map[i] = bp->tx_nr_rings_xdp + i;
-   if (i >= bp->tx_nr_rings_xdp)
+   if (i >= bp->tx_nr_rings_xdp) {
bp->tx_ring[i].txq_index = i -
bp->tx_nr_rings_xdp;
-   else
+   bp->bnapi[j]->tx_int = bnxt_tx_int;
+   } else {
bp->bnapi[j]->flags |= BNXT_NAPI_FLAG_XDP;
-   bp->bnapi[j]->tx_int = bnxt_tx_int;
+   bp->bnapi[j]->tx_int = bnxt_tx_int_xdp;
+   }
}
 
rc = bnxt_alloc_stats(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index db4a410..9f07b9c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -514,13 +514,17 @@ struct rx_tpa_end_cmp_ext {
 
 #define BNXT_RX_EVENT  1
 #define BNXT_AGG_EVENT 2
+#define BNXT_TX_EVENT  4
 
 struct bnxt_sw_tx_bd {
struct sk_buff  *skb;
DEFINE_DMA_UNMAP_ADDR(mapping);
u8  is_gso;
u8  is_push;
-   unsigned short  nr_frags;
+   union {
+   unsigned short  nr_frags;
+   u16 rx_prod;
+   };
 };
 
 struct bnxt_sw_rx_bd {
@@ -1191,6 +1195,19 @@ struct bnxt {
 #define SFF_MODULE_ID_QSFP28   0x11
 #define BNXT_MAX_PHY_I2C_RESP_SIZE 64
 
+static inline u32 bnxt_tx_avail(struct bnxt *bp, struct bnxt_tx_ring_info *txr)
+{
+   /* Tell compiler to fetch tx indices from memory. */
+   barrier();
+
+   return bp->tx_ring_size -
+   ((txr->tx_prod - txr->tx_cons) & bp->tx_ring_mask);
+}
+
+extern const u16 bnxt_lhint_arr[];

[PATCH net-next v3 05/12] bnxt_en: Add RX page mode support.

2017-02-06 Thread Michael Chan
This mode is to support XDP.  In this mode, each rx ring is configured
with page sized buffers for linear placement of each packet.  MTU will be
restricted to what the page sized buffers can support.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 135 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |   7 ++
 2 files changed, 124 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c0f2167..0bcd465 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -573,6 +573,25 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi 
*bnapi, int nr_pkts)
}
 }
 
+static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
+gfp_t gfp)
+{
+   struct device *dev = &bp->pdev->dev;
+   struct page *page;
+
+   page = alloc_page(gfp);
+   if (!page)
+   return NULL;
+
+   *mapping = dma_map_page(dev, page, 0, PAGE_SIZE, bp->rx_dir);
+   if (dma_mapping_error(dev, *mapping)) {
+   __free_page(page);
+   return NULL;
+   }
+   *mapping += bp->rx_dma_offset;
+   return page;
+}
+
 static inline u8 *__bnxt_alloc_rx_data(struct bnxt *bp, dma_addr_t *mapping,
   gfp_t gfp)
 {
@@ -599,19 +618,28 @@ static inline int bnxt_alloc_rx_data(struct bnxt *bp,
 {
struct rx_bd *rxbd = &rxr->rx_desc_ring[RX_RING(prod)][RX_IDX(prod)];
struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[prod];
-   u8 *data;
dma_addr_t mapping;
 
-   data = __bnxt_alloc_rx_data(bp, &mapping, gfp);
-   if (!data)
-   return -ENOMEM;
+   if (BNXT_RX_PAGE_MODE(bp)) {
+   struct page *page = __bnxt_alloc_rx_page(bp, &mapping, gfp);
 
-   rx_buf->data = data;
-   rx_buf->data_ptr = data + bp->rx_offset;
+   if (!page)
+   return -ENOMEM;
+
+   rx_buf->data = page;
+   rx_buf->data_ptr = page_address(page) + bp->rx_offset;
+   } else {
+   u8 *data = __bnxt_alloc_rx_data(bp, &mapping, gfp);
+
+   if (!data)
+   return -ENOMEM;
+
+   rx_buf->data = data;
+   rx_buf->data_ptr = data + bp->rx_offset;
+   }
rx_buf->mapping = mapping;
 
rxbd->rx_bd_haddr = cpu_to_le64(mapping);
-
return 0;
 }
 
@@ -754,6 +782,51 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi 
*bnapi, u16 cp_cons,
rxr->rx_sw_agg_prod = sw_prod;
 }
 
+static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp,
+   struct bnxt_rx_ring_info *rxr,
+   u16 cons, void *data, u8 *data_ptr,
+   dma_addr_t dma_addr,
+   unsigned int offset_and_len)
+{
+   unsigned int payload = offset_and_len >> 16;
+   unsigned int len = offset_and_len & 0x;
+   struct skb_frag_struct *frag;
+   struct page *page = data;
+   u16 prod = rxr->rx_prod;
+   struct sk_buff *skb;
+   int off, err;
+
+   err = bnxt_alloc_rx_data(bp, rxr, prod, GFP_ATOMIC);
+   if (unlikely(err)) {
+   bnxt_reuse_rx_data(rxr, cons, data);
+   return NULL;
+   }
+   dma_addr -= bp->rx_dma_offset;
+   dma_unmap_page(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir);
+
+   if (unlikely(!payload))
+   payload = eth_get_headlen(data_ptr, len);
+
+   skb = napi_alloc_skb(&rxr->bnapi->napi, payload);
+   if (!skb) {
+   __free_page(page);
+   return NULL;
+   }
+
+   off = (void *)data_ptr - page_address(page);
+   skb_add_rx_frag(skb, 0, page, off, len, PAGE_SIZE);
+   memcpy(skb->data - NET_IP_ALIGN, data_ptr - NET_IP_ALIGN,
+  payload + NET_IP_ALIGN);
+
+   frag = &skb_shinfo(skb)->frags[0];
+   skb_frag_size_sub(frag, payload);
+   frag->page_offset += payload;
+   skb->data_len -= payload;
+   skb->tail += payload;
+
+   return skb;
+}
+
 static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
   struct bnxt_rx_ring_info *rxr, u16 cons,
   void *data, u8 *data_ptr,
@@ -1329,6 +1402,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
struct sk_buff *skb;
void *data;
int rc = 0;
+   u32 misc;
 
rxcmp = (struct rx_cmp *)
&cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)];
@@ -1381,8 +1455,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
}
prefetch(data_ptr);
 
-   agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & RX_CMP_AGG_BUFS) >>
-   

[PATCH net-next v3 10/12] bnxt_en: Refactor tx completion path.

2017-02-06 Thread Michael Chan
XDP_TX requires a different function to handle completion.  Add a
function pointer to handle tx completion logic.  Regular TX rings
will be assigned the current bnxt_tx_int() for the ->tx_int()
function pointer.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 -
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 +
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 412a8de..64dc94d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1765,7 +1765,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct 
bnxt_napi *bnapi, int budget)
BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
 
if (tx_pkts)
-   bnxt_tx_int(bp, bnapi, tx_pkts);
+   bnapi->tx_int(bp, bnapi, tx_pkts);
 
if (event & BNXT_RX_EVENT) {
struct bnxt_rx_ring_info *rxr = bnapi->rx_ring;
@@ -3048,6 +3048,9 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool 
irq_re_init)
if (i >= bp->tx_nr_rings_xdp)
bp->tx_ring[i].txq_index = i -
bp->tx_nr_rings_xdp;
+   else
+   bp->bnapi[j]->flags |= BNXT_NAPI_FLAG_XDP;
+   bp->bnapi[j]->tx_int = bnxt_tx_int;
}
 
rc = bnxt_alloc_stats(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 4c3289a..6dc43f5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -664,6 +664,11 @@ struct bnxt_napi {
struct bnxt_rx_ring_info*rx_ring;
struct bnxt_tx_ring_info*tx_ring;
 
+   void(*tx_int)(struct bnxt *, struct bnxt_napi *,
+ int);
+   u32 flags;
+#define BNXT_NAPI_FLAG_XDP 0x1
+
boolin_reset;
 };
 
-- 
1.8.3.1



[PATCH net-next v3 08/12] bnxt_en: Add tx ring mapping logic.

2017-02-06 Thread Michael Chan
To support XDP_TX, we need to add a set of dedicated TX rings, each
associated with the NAPI of an RX ring.  To assign XDP rings and regular
rings in a flexible way, we add a bp->tx_ring_map[] array to do the
remapping.  The netdev txq index is stored in the new field txq_index
so that we can retrieve the netdev txq when handling TX completions.
In this patch, before we introduce XDP_TX, the mapping is 1:1.

v2: Fixed a bug in bnxt_tx_int().

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 15 ---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 ++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1b051f9..811bc82 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -262,8 +262,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, 
struct net_device *dev)
return NETDEV_TX_OK;
}
 
-   txr = >tx_ring[i];
txq = netdev_get_tx_queue(dev, i);
+   txr = &bp->tx_ring[bp->tx_ring_map[i]];
prod = txr->tx_prod;
 
free_size = bnxt_tx_avail(bp, txr);
@@ -509,8 +509,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, 
struct net_device *dev)
 static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 {
struct bnxt_tx_ring_info *txr = bnapi->tx_ring;
-   int index = txr - >tx_ring[0];
-   struct netdev_queue *txq = netdev_get_tx_queue(bp->dev, index);
+   struct netdev_queue *txq = netdev_get_tx_queue(bp->dev, txr->txq_index);
u16 cons = txr->tx_cons;
struct pci_dev *pdev = bp->pdev;
int i;
@@ -2975,6 +2974,8 @@ static void bnxt_free_mem(struct bnxt *bp, bool 
irq_re_init)
bnxt_free_stats(bp);
bnxt_free_ring_grps(bp);
bnxt_free_vnics(bp);
+   kfree(bp->tx_ring_map);
+   bp->tx_ring_map = NULL;
kfree(bp->tx_ring);
bp->tx_ring = NULL;
kfree(bp->rx_ring);
@@ -3027,6 +3028,12 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool 
irq_re_init)
if (!bp->tx_ring)
return -ENOMEM;
 
+   bp->tx_ring_map = kcalloc(bp->tx_nr_rings, sizeof(u16),
+ GFP_KERNEL);
+
+   if (!bp->tx_ring_map)
+   return -ENOMEM;
+
if (bp->flags & BNXT_FLAG_SHARED_RINGS)
j = 0;
else
@@ -3035,6 +3042,8 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool 
irq_re_init)
for (i = 0; i < bp->tx_nr_rings; i++, j++) {
bp->tx_ring[i].bnapi = bp->bnapi[j];
bp->bnapi[j]->tx_ring = &bp->tx_ring[i];
+   bp->tx_ring_map[i] = i;
+   bp->tx_ring[i].txq_index = i;
}
 
rc = bnxt_alloc_stats(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3a079b8..97ccce1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -566,6 +566,7 @@ struct bnxt_tx_ring_info {
struct bnxt_napi*bnapi;
u16 tx_prod;
u16 tx_cons;
+   u16 txq_index;
void __iomem*tx_doorbell;
 
struct tx_bd*tx_desc_ring[MAX_TX_PAGES];
@@ -995,6 +996,7 @@ struct bnxt {
 
struct bnxt_rx_ring_info*rx_ring;
struct bnxt_tx_ring_info*tx_ring;
+   u16 *tx_ring_map;
 
struct sk_buff *(*gro_func)(struct bnxt_tpa_info *, int, int,
struct sk_buff *);
-- 
1.8.3.1



[PATCH net-next v3 09/12] bnxt_en: Add a set of TX rings to support XDP.

2017-02-06 Thread Michael Chan
Add logic for an extra set of TX rings for XDP.  If enabled, this
set of TX rings equals the number of RX rings and shares the same
IRQ as the RX ring set.  A new field bp->tx_nr_rings_xdp is added
to keep track of these TX XDP rings.  Adjust all other relevant functions
to handle bp->tx_nr_rings_xdp.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 21 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 16 
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 811bc82..412a8de 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2218,6 +2218,8 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
memset(txr->tx_push, 0, sizeof(struct tx_push_bd));
}
ring->queue_id = bp->q_info[j].queue_id;
+   if (i < bp->tx_nr_rings_xdp)
+   continue;
if (i % bp->tx_nr_rings_per_tc == (bp->tx_nr_rings_per_tc - 1))
j++;
}
@@ -3042,8 +3044,10 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool 
irq_re_init)
for (i = 0; i < bp->tx_nr_rings; i++, j++) {
bp->tx_ring[i].bnapi = bp->bnapi[j];
bp->bnapi[j]->tx_ring = &bp->tx_ring[i];
-   bp->tx_ring_map[i] = i;
-   bp->tx_ring[i].txq_index = i;
+   bp->tx_ring_map[i] = bp->tx_nr_rings_xdp + i;
+   if (i >= bp->tx_nr_rings_xdp)
+   bp->tx_ring[i].txq_index = i -
+   bp->tx_nr_rings_xdp;
}
 
rc = bnxt_alloc_stats(bp);
@@ -4966,7 +4970,8 @@ static int bnxt_set_real_num_queues(struct bnxt *bp)
int rc;
struct net_device *dev = bp->dev;
 
-   rc = netif_set_real_num_tx_queues(dev, bp->tx_nr_rings);
+   rc = netif_set_real_num_tx_queues(dev, bp->tx_nr_rings -
+ bp->tx_nr_rings_xdp);
if (rc)
return rc;
 
@@ -6582,7 +6587,7 @@ static void bnxt_sp_task(struct work_struct *work)
 }
 
 /* Under rtnl_lock */
-int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, int tcs)
+int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, int tcs, int tx_xdp)
 {
int max_rx, max_tx, tx_sets = 1;
int tx_rings_needed;
@@ -6602,12 +6607,12 @@ int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, 
int tcs)
if (max_rx < rx)
return -ENOMEM;
 
-   tx_rings_needed = tx * tx_sets;
+   tx_rings_needed = tx * tx_sets + tx_xdp;
if (max_tx < tx_rings_needed)
return -ENOMEM;
 
if (bnxt_hwrm_reserve_tx_rings(bp, &tx_rings_needed) ||
-   tx_rings_needed < (tx * tx_sets))
+   tx_rings_needed < (tx * tx_sets + tx_xdp))
return -ENOMEM;
return 0;
 }
@@ -6788,8 +6793,8 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc)
if (bp->flags & BNXT_FLAG_SHARED_RINGS)
sh = true;
 
-   rc = bnxt_reserve_rings(bp, bp->tx_nr_rings_per_tc,
-   bp->rx_nr_rings, tc);
+   rc = bnxt_reserve_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings,
+   tc, bp->tx_nr_rings_xdp);
if (rc)
return rc;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 97ccce1..4c3289a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1026,6 +1026,7 @@ struct bnxt {
int tx_nr_pages;
int tx_nr_rings;
int tx_nr_rings_per_tc;
+   int tx_nr_rings_xdp;
 
int tx_wake_thresh;
int tx_push_thresh;
@@ -1203,7 +1204,7 @@ int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, 
unsigned long *bmap,
 int bnxt_hwrm_fw_set_time(struct bnxt *);
 int bnxt_open_nic(struct bnxt *, bool, bool);
 int bnxt_close_nic(struct bnxt *, bool, bool);
-int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, int tcs);
+int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, int tcs, int tx_xdp);
 int bnxt_setup_mq_tc(struct net_device *dev, u8 tc);
 int bnxt_get_max_rings(struct bnxt *, int *, int *, bool);
 void bnxt_restore_pf_fw_resources(struct bnxt *bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 6f2568d..7aa248d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -389,6 +389,7 @@ 

  1   2   3   >