Re: [PATCH net-next 1/3] ethtool: Ensure new ring parameters are within bounds during SRINGPARAM
On Tue, 9 Jan 2018 09:30:05 +0200 Tariq Toukan wrote: > >> diff --git a/net/core/ethtool.c b/net/core/ethtool.c > >> index 50a79203043b..9ea7cd52fde0 100644 > >> --- a/net/core/ethtool.c > >> +++ b/net/core/ethtool.c > >> @@ -1704,14 +1704,23 @@ static int ethtool_get_ringparam(struct > >> net_device *dev, void __user *useraddr) > >> static int ethtool_set_ringparam(struct net_device *dev, void > >> __user *useraddr) { > >> - struct ethtool_ringparam ringparam; > >> + struct ethtool_ringparam ringparam, max = { .cmd = > >> ETHTOOL_GRINGPARAM }; > >> - if (!dev->ethtool_ops->set_ringparam) > >> + if (!dev->ethtool_ops->set_ringparam > >> || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; > >> > >>if (copy_from_user(&ringparam, useraddr, > >> sizeof(ringparam))) return -EFAULT; > >> > >> + dev->ethtool_ops->get_ringparam(dev, &max); > > > > Perhaps check the return value here? It's pretty unlikely but > > get_ringparam may fail. > > > > get_ringparam NDO returns void. Ah, you're right, I looked at the return of ethtool_get_ringparam().
[patch iproute2 v7 2/2] tc: Add batchsize feature for filter and actions
Currently in tc batch mode, only one command is read from the batch file and sent to kernel to process. With this support, at most 128 commands can be accumulated before sending to kernel. Now it only works for the following successive commands: filter and actions add/delete/change/replace. Signed-off-by: Chris Mi --- tc/m_action.c | 60 +-- tc/tc.c| 127 ++--- tc/tc_common.h | 5 ++- tc/tc_filter.c | 97 +-- 4 files changed, 210 insertions(+), 79 deletions(-) diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..e5c53a80 100644 --- a/tc/m_action.c +++ b/tc/m_action.c @@ -546,40 +546,56 @@ bad_val: return ret; } +struct tc_action_req { + struct nlmsghdr n; + struct tcamsg t; + charbuf[MAX_MSG]; +}; + static int tc_action_modify(int cmd, unsigned int flags, - int *argc_p, char ***argv_p) + int *argc_p, char ***argv_p, + void *buf) { - int argc = *argc_p; + struct tc_action_req *req, action_req; char **argv = *argv_p; + struct rtattr *tail; + int argc = *argc_p; + struct iovec iov; int ret = 0; - struct { - struct nlmsghdr n; - struct tcamsg t; - charbuf[MAX_MSG]; - } req = { - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), - .n.nlmsg_flags = NLM_F_REQUEST | flags, - .n.nlmsg_type = cmd, - .t.tca_family = AF_UNSPEC, - }; - struct rtattr *tail = NLMSG_TAIL(&req.n); + + if (buf) + req = buf; + else + req = &action_req; + + req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)); + req->n.nlmsg_flags = NLM_F_REQUEST | flags; + req->n.nlmsg_type = cmd; + req->t.tca_family = AF_UNSPEC; + tail = NLMSG_TAIL(&req->n); argc -= 1; argv += 1; - if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) { + if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) { fprintf(stderr, "Illegal \"action\"\n"); return -1; } - tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail; + tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail; + + *argc_p = argc; + *argv_p = argv; - if (rtnl_talk(&rth, &req.n, NULL) < 0) { + iov.iov_base = &req->n; + iov.iov_len = req->n.nlmsg_len; + 
+ if (buf) + return 0; + + if (rtnl_talk_iov(&rth, &iov, 1, NULL) < 0) { fprintf(stderr, "We have an error talking to the kernel\n"); ret = -1; } - *argc_p = argc; - *argv_p = argv; - return ret; } @@ -679,7 +695,7 @@ bad_val: return ret; } -int do_action(int argc, char **argv) +int do_action(int argc, char **argv, void *buf) { int ret = 0; @@ -689,12 +705,12 @@ int do_action(int argc, char **argv) if (matches(*argv, "add") == 0) { ret = tc_action_modify(RTM_NEWACTION, NLM_F_EXCL | NLM_F_CREATE, - &argc, &argv); + &argc, &argv, buf); } else if (matches(*argv, "change") == 0 || matches(*argv, "replace") == 0) { ret = tc_action_modify(RTM_NEWACTION, NLM_F_CREATE | NLM_F_REPLACE, - &argc, &argv); + &argc, &argv, buf); } else if (matches(*argv, "delete") == 0) { argc -= 1; argv += 1; diff --git a/tc/tc.c b/tc/tc.c index ad9f07e9..f32e4978 100644 --- a/tc/tc.c +++ b/tc/tc.c @@ -193,16 +193,16 @@ static void usage(void) "-nm | -nam[es] | { -cf | -conf } path } | -j[son]\n"); } -static int do_cmd(int argc, char **argv) +static int do_cmd(int argc, char **argv, void *buf) { if (matches(*argv, "qdisc") == 0) return do_qdisc(argc-1, argv+1); if (matches(*argv, "class") == 0) return do_class(argc-1, argv+1); if (matches(*argv, "filter") == 0) - return do_filter(argc-1, argv+1); + return do_filter(argc-1, argv+1, buf); if (matches(*argv, "actions") == 0) - return do_action(argc-1, argv+1); + return do_action(argc-1, argv+1, buf); if (matches(*argv, "monitor") == 0) return do_tcmonitor(argc-1, argv+1); if (matches(*argv, "exec") == 0) @@ -217,11 +21
Re: [PATCH net-next 1/3] ethtool: Ensure new ring parameters are within bounds during SRINGPARAM
On 09/01/2018 4:23 AM, Jakub Kicinski wrote: On Mon, 8 Jan 2018 16:00:24 +0200, Tariq Toukan wrote: From: Eugenia Emantayev Add a sanity check to ensure that all requested ring parameters are within bounds, which should reduce errors in driver implementation. (y) Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan --- net/core/ethtool.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 50a79203043b..9ea7cd52fde0 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1704,14 +1704,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam; + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; - if (!dev->ethtool_ops->set_ringparam) + if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; + dev->ethtool_ops->get_ringparam(dev, &max); Perhaps check the return value here? It's pretty unlikely but get_ringparam may fail. get_ringparam NDO returns void. + /* ensure new ring parameters are within the maximums */ + if (ringparam.rx_pending > max.rx_max_pending || + ringparam.rx_mini_pending > max.rx_mini_max_pending || + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || + ringparam.tx_pending > max.tx_max_pending) + return -EINVAL; + return dev->ethtool_ops->set_ringparam(dev, &ringparam); }
Re: [PATCH] ath9k: add a quirk to set use_msi automatically
AceLan Kao writes: > Some platform(BIOS) blocks legacy interrupts (INTx), and only allows MSI > for WLAN device. So adding a quirk to list those machines and set > use_msi automatically. > Adding the following platforms to the quirk. >Dell Inspiron 24-3460 >Dell Inspiron 3472 >Dell Inspiron 14-3473 >Dell Vostro 3262 >Dell Vostro 15-3572 > > Signed-off-by: AceLan Kao [...] > @@ -96,6 +97,56 @@ static const struct ieee80211_tpt_blink ath9k_tpt_blink[] > = { > }; > #endif > > +static int __init set_use_msi(const struct dmi_system_id *dmi) > +{ > + ath9k_use_msi = 1; > + return 1; > +} > + > +static const struct dmi_system_id ath9k_quirks[] __initconst = { > + { > + .callback = set_use_msi, > + .ident = "Dell Inspiron 24-3460", > + .matches = { > + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), > + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 24-3460"), > + }, > + }, Larry, didn't rtlwifi have similar situation that with certain laptops users were required to enable a module parameter to get the device working? I think rtlwifi should do the same as AceLan does here as then the user would not need to manually set the module parameter. -- Kalle Valo
Re: [PATCH net-next v3 06/10] net/mlx5e: Change Mellanox references in DIM code
On 01/08/2018 11:06 PM, Saeed Mahameed wrote: On 01/08/2018 10:13 PM, Andy Gospodarek wrote: From: Andy Gospodarek Change all appropriate mlx5_am* and MLX5_AM* references to net_dim and NET_DIM, respectively, in code that handles dynamic interrupt moderation. Also change all references from 'am' to 'dim' when used as local variables and add generic profile references. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +- drivers/net/ethernet/mellanox/mlx5/core/en_dim.c | 14 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 40 ++- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/net_dim.c | 286 ++--- drivers/net/ethernet/mellanox/mlx5/core/net_dim.h | 63 ++--- 7 files changed, 225 insertions(+), 201 deletions(-) [...] #define IS_SIGNIFICANT_DIFF(val, ref) \ (((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */ -static int mlx5e_am_stats_compare(struct mlx5e_rx_am_stats *curr, - struct mlx5e_rx_am_stats *prev) +static int net_dim_stats_compare(struct net_dim_stats *curr, + struct net_dim_stats *prev) { if (!prev->bpms) - return curr->bpms ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_SAME; + return curr->bpms ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_SAME; if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) - return (curr->bpms > prev->bpms) ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_WORSE; + return (curr->bpms > prev->bpms) ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_WORSE; Hey Andy, I am currently reviewing a patch internally that fixes a bug in this area, prev->ppms can be 0 and could cause IS_SIGNIFICANT_DIFF ouch ! I meant cause division by 0 in "IS_SIGNIFICANT_DIFF" same goes for prev->eppm, for some reason we had a broken assumption that if ppms is 0 for some reason then the bpms is 0 and the above condition will cover us. 
Anyway the patch will go to net, which means when this series gets accepted then net-next will fail to merge with net and we need to manually push the fix to the new DIM library. But for now I don't think anything is required for this series other than bringing this division by 0 issue and the future merge conflict to your attention. if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) - return (curr->ppms > prev->ppms) ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_WORSE; + return (curr->ppms > prev->ppms) ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_WORSE; if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) - return (curr->epms < prev->epms) ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_WORSE; + return (curr->epms < prev->epms) ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_WORSE; - return MLX5E_AM_STATS_SAME; + return NET_DIM_STATS_SAME; }
Re: [PATCH net-next v3 09/10] bnxt_en: add support for software dynamic interrupt moderation
On Mon, Jan 8, 2018 at 10:13 PM, Andy Gospodarek wrote: > From: Andy Gospodarek > > This implements the changes needed for the bnxt_en driver to add support > for dynamic interrupt moderation per ring. > > This does add additional counters in the receive path, but testing shows > that any additional instructions are offset by throughput gain when the > default configuration is for low latency. > > Signed-off-by: Andy Gospodarek > Cc: Michael Chan Acked-by: Michael Chan
Re: [PATCH net-next v3 06/10] net/mlx5e: Change Mellanox references in DIM code
On 01/08/2018 10:13 PM, Andy Gospodarek wrote: From: Andy Gospodarek Change all appropriate mlx5_am* and MLX5_AM* references to net_dim and NET_DIM, respectively, in code that handles dynamic interrupt moderation. Also change all references from 'am' to 'dim' when used as local variables and add generic profile references. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +- drivers/net/ethernet/mellanox/mlx5/core/en_dim.c | 14 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 40 ++- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/net_dim.c | 286 ++--- drivers/net/ethernet/mellanox/mlx5/core/net_dim.h | 63 ++--- 7 files changed, 225 insertions(+), 201 deletions(-) [...] #define IS_SIGNIFICANT_DIFF(val, ref) \ (((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */ -static int mlx5e_am_stats_compare(struct mlx5e_rx_am_stats *curr, - struct mlx5e_rx_am_stats *prev) +static int net_dim_stats_compare(struct net_dim_stats *curr, +struct net_dim_stats *prev) { if (!prev->bpms) - return curr->bpms ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_SAME; + return curr->bpms ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_SAME; if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) - return (curr->bpms > prev->bpms) ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_WORSE; + return (curr->bpms > prev->bpms) ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_WORSE; Hey Andy, I am currently reviewing a patch internally that fixes a bug in this area, prev->ppms can be 0 and could cause IS_SIGNIFICANT_DIFF ouch ! same goes for prev->eppm, for some reason we had a broken assumption that if ppms is 0 for some reason then the bpms is 0 and the above condition will cover us. 
Anyway the patch will go to net, which means when this series gets accepted then net-next will fail to merge with net and we need to manually push the fix to the new DIM library. But for now I don't think anything is required for this series other than bringing this division by 0 issue and the future merge conflict to your attention. if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) - return (curr->ppms > prev->ppms) ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_WORSE; + return (curr->ppms > prev->ppms) ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_WORSE; if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) - return (curr->epms < prev->epms) ? MLX5E_AM_STATS_BETTER : - MLX5E_AM_STATS_WORSE; + return (curr->epms < prev->epms) ? NET_DIM_STATS_BETTER : + NET_DIM_STATS_WORSE; - return MLX5E_AM_STATS_SAME; + return NET_DIM_STATS_SAME; }
[patch iproute2 v7 1/2] lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov
rtnl_talk can only send a single message to kernel. Add two functions rtnl_talk_msg and rtnl_talk_iov that can send multiple messages to kernel. rtnl_talk_msg takes struct msghdr * as argument. rtnl_talk_iov takes struct iovec * and iovlen as arguments. Signed-off-by: Chris Mi --- include/libnetlink.h | 6 lib/libnetlink.c | 84 2 files changed, 71 insertions(+), 19 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index a4d83b9e..e9a63dbc 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -96,6 +96,12 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth, int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr **answer) __attribute__((warn_unused_result)); +int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m, + struct nlmsghdr **answer) + __attribute__((warn_unused_result)); +int rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iovec, size_t iovlen, + struct nlmsghdr **answer) + __attribute__((warn_unused_result)); int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr **answer, nl_ext_ack_fn_t errfn) __attribute__((warn_unused_result)); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 00e6ce0c..ae0059f9 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -581,39 +581,43 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err, strerror(-err->error)); } -static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, - struct nlmsghdr **answer, - bool show_rtnl_err, nl_ext_ack_fn_t errfn) +static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m, + struct nlmsghdr **answer, + bool show_rtnl_err, nl_ext_ack_fn_t errfn) { - int status; - unsigned int seq; - struct nlmsghdr *h; struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; - struct iovec iov = { - .iov_base = n, - .iov_len = n->nlmsg_len - }; + int i, status, iovlen = m->msg_iovlen; + struct iovec iov; struct msghdr msg = { .msg_name = &nladdr, .msg_namelen = sizeof(nladdr), 
.msg_iov = &iov, .msg_iovlen = 1, }; - char *buf; - - n->nlmsg_seq = seq = ++rtnl->seq; + unsigned int seq = 0; + struct nlmsghdr *h; - if (answer == NULL) - n->nlmsg_flags |= NLM_F_ACK; + for (i = 0; i < iovlen; i++) { + struct iovec *v; + v = &m->msg_iov[i]; + h = v->iov_base; + h->nlmsg_seq = seq = ++rtnl->seq; + if (answer == NULL) + h->nlmsg_flags |= NLM_F_ACK; + } - status = sendmsg(rtnl->fd, &msg, 0); + status = sendmsg(rtnl->fd, m, 0); if (status < 0) { perror("Cannot talk to rtnetlink"); return -1; } + i = 0; while (1) { + char *buf; +next: status = rtnl_recvmsg(rtnl->fd, &msg, &buf); + ++i; if (status < 0) return status; @@ -642,7 +646,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, if (nladdr.nl_pid != 0 || h->nlmsg_pid != rtnl->local.nl_pid || - h->nlmsg_seq != seq) { + h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) { /* Don't forget to skip that message. */ status -= NLMSG_ALIGN(len); h = (struct nlmsghdr *)((char *)h + NLMSG_ALIGN(len)); @@ -662,7 +666,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, *answer = (struct nlmsghdr *)buf; else free(buf); - return 0; + if (h->nlmsg_seq == seq) + return 0; + else + goto next; } if (rtnl->proto != NETLINK_SOCK_DIAG && @@ -671,7 +678,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, errno = -err->error; free(buf); - return -1; + return -i; } if (answer) { @@ -698,12 +705,51 @@ static int __rtnl_talk(struct r
[patch iproute2 v7 0/2] tc: Add batchsize feature to batch mode
Currently in tc batch mode, only one command is read from the batch file and sent to kernel to process. With this patchset, at most 128 commands can be accumulated before sending to kernel. We introduced two new functions in patch 1 to support sending multiple messages. In patch 2, we add this support for filter and actions add/delete/change/replace commands. But please note that kernel still processes the requests one by one. To process the requests in parallel in kernel is another effort. The time we're saving in this patchset is the user mode and kernel mode context switch. So this patchset works on top of the current kernel. Using the following script in kernel, we can generate 1,000,000 rules. tools/testing/selftests/tc-testing/tdc_batch.py Without this patchset, 'tc -b $file' execution time is: real 0m15.555s user 0m7.211s sys 0m8.284s With this patchset, 'tc -b $file' execution time is: real 0m13.562s user 0m6.463s sys 0m7.031s The insertion rate is improved by more than 10%. v3 == 1. Instead of hacking function rtnl_talk directly, add a new function rtnl_talk_msg. 2. remove most of global variables to use parameter passing 3. divide the previous patch into 4 patches. v4 == 1. Remove function setcmdlinetotal. Now in function batch, we read one more line to determine if we are reaching the end of file. 2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg directly. 3. if (batch_size < 1) batch_size = 1; v5 == 1. Fix a bug that can't deal with batch file with blank line. 2. Describe the limitation in man page. v6 == 1. Add support for mixed commands. 2. Fix a bug that not all messages are acked if batch size > 1. v7 == 1. We can tell exactly which command fails. 2. Add a new function rtnl_talk_iov 3. Allocate the memory in function batch() instead of each client. 4. Remove option -bs. 
Chris Mi (2): lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov tc: Add batchsize feature to batch mode include/libnetlink.h | 6 +++ lib/libnetlink.c | 84 ++ tc/m_action.c| 60 +++- tc/tc.c | 127 +-- tc/tc_common.h | 5 +- tc/tc_filter.c | 97 +++ 6 files changed, 281 insertions(+), 98 deletions(-) -- 2.14.3
Re: [PATCH net-next v3 00/10] net: create dynamic software irq moderation library
On 1/9/2018 8:13 AM, Andy Gospodarek wrote: From: Andy Gospodarek This converts the dynamic interrupt moderation library from the mlx5e driver into a library so it can be used by any driver. The penultimate patch in this set adds support for thiw new dynamic interrupt moderation library in the bnxt_en driver and the last patch creates an entry in the MAINTAINERS file for this library. thiw->this.
[PATCH] ipvlan: fix ipvlan MTU limits
From: Keefe Liu The MTU of an ipvlan interface should not be bigger than that of the phy device. When we run the following scripts, we will find there are some problems. Step1: ip link add link eth0 name ipv1 type ipvlan mode l2 ip netns add net1 ip link set dev ipv1 netns net1 Step2: ip netns exec net1 ip link set dev ipv1 mtu 1501 RTNETLINK answers: Invalid argument dmesg info: "ipv1: Invalid MTU 1501 requested, hw max 1500" Step3: ip link set dev eth0 mtu 1600 ip netns exec net1 ip link set dev ipv1 mtu 1501 RTNETLINK answers: Invalid argument dmesg info: "ipv1: Invalid MTU 1501 requested, hw max 1500" Step4: ip link set dev eth0 mtu 1400 ip netns exec net1 ip link set dev ipv1 mtu 1500 The result of Step2 is what we expected, but the results of Step3 and Step4 are not. This patch sets ipvlan's maximum MTU to ETH_MAX_MTU, and when we change the ipvlan device's MTU, ipvlan_change_mtu() will make sure the new MTU is no larger than the phy device's MTU. Signed-off-by: Keefe Liu --- drivers/net/ipvlan/ipvlan_main.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 30cb803..84c007d 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -380,12 +380,24 @@ static int ipvlan_get_iflink(const struct net_device *dev) return ipvlan->phy_dev->ifindex; } +static int ipvlan_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (ipvlan->phy_dev->mtu < new_mtu) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + static const struct net_device_ops ipvlan_netdev_ops = { .ndo_init = ipvlan_init, .ndo_uninit = ipvlan_uninit, .ndo_open = ipvlan_open, .ndo_stop = ipvlan_stop, .ndo_start_xmit = ipvlan_start_xmit, + .ndo_change_mtu = ipvlan_change_mtu, .ndo_fix_features = ipvlan_fix_features, .ndo_change_rx_flags= ipvlan_change_rx_flags, .ndo_set_rx_mode= ipvlan_set_multicast_mac_filter, @@ -680,6 +692,8 @@ void ipvlan_link_setup(struct net_device 
*dev) { ether_setup(dev); + dev->min_mtu = 0; + dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; dev->netdev_ops = &ipvlan_netdev_ops; -- 1.8.3.1
RE: [patch iproute2 v6 2/3] tc: Add -bs option to batch mode
> -Original Message- > From: Marcelo Ricardo Leitner [mailto:marcelo.leit...@gmail.com] > Sent: Saturday, January 6, 2018 3:15 AM > To: David Ahern > Cc: Chris Mi ; netdev@vger.kernel.org; > gerlitz...@gmail.com; step...@networkplumber.org > Subject: Re: [patch iproute2 v6 2/3] tc: Add -bs option to batch mode > > On Fri, Jan 05, 2018 at 11:15:59AM -0700, David Ahern wrote: > > On 1/4/18 12:34 AM, Chris Mi wrote: > > > Currently in tc batch mode, only one command is read from the batch > > > file and sent to kernel to process. With this support, we can > > > accumulate several commands before sending to kernel. > > > > > > Now it only works for the following successive rules, 1. filter add > > > 2. filter delete 3. actions add 4. actions delete > > > > > > Otherwise, the batch size is still 1. > > > > > > Signed-off-by: Chris Mi > > > --- > > > tc/m_action.c | 93 ++-- > > > tc/tc.c| 96 +++-- > > > tc/tc_common.h | 8 +++- > > > tc/tc_filter.c | 132 > > > - > > > 4 files changed, 252 insertions(+), 77 deletions(-) > > > > > > diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..cf5cc95d > > > 100644 > > > --- a/tc/m_action.c > > > +++ b/tc/m_action.c > > > @@ -23,6 +23,7 @@ > > > #include > > > #include > > > #include > > > +#include > > > > > > #include "utils.h" > > > #include "tc_common.h" > > > @@ -546,40 +547,86 @@ bad_val: > > > return ret; > > > } > > > > > > +typedef struct { > > > + struct nlmsghdr n; > > > + struct tcamsg t; > > > + charbuf[MAX_MSG]; > > > +} tc_action_req; > > > + > > > +static tc_action_req *action_reqs; > > > +static struct iovec msg_iov[MSG_IOV_MAX]; > > > + > > > +void free_action_reqs(void) > > > +{ > > > + free(action_reqs); > > > +} > > > + > > > +static tc_action_req *get_action_req(int batch_size, int index) { > > > + tc_action_req *req; > > > + > > > + if (action_reqs == NULL) { > > > + action_reqs = malloc(batch_size * sizeof (tc_action_req)); > > > + if (action_reqs == NULL) > > > + return NULL; > > > + } > > > + req = 
&action_reqs[index]; > > > + memset(req, 0, sizeof (*req)); > > > + > > > + return req; > > > +} > > > + > > > static int tc_action_modify(int cmd, unsigned int flags, > > > - int *argc_p, char ***argv_p) > > > + int *argc_p, char ***argv_p, > > > + int batch_size, int index, bool send) > > > { > > > - int argc = *argc_p; > > > + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; > > > + struct iovec *iov = &msg_iov[index]; > > > char **argv = *argv_p; > > > - int ret = 0; > > > - struct { > > > - struct nlmsghdr n; > > > - struct tcamsg t; > > > - charbuf[MAX_MSG]; > > > - } req = { > > > - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), > > > - .n.nlmsg_flags = NLM_F_REQUEST | flags, > > > - .n.nlmsg_type = cmd, > > > - .t.tca_family = AF_UNSPEC, > > > + struct msghdr msg = { > > > + .msg_name = &nladdr, > > > + .msg_namelen = sizeof(nladdr), > > > + .msg_iov = msg_iov, > > > + .msg_iovlen = index + 1, > > > }; > > > - struct rtattr *tail = NLMSG_TAIL(&req.n); > > > + struct rtattr *tail; > > > + tc_action_req *req; > > > + int argc = *argc_p; > > > + int ret = 0; > > > + > > > + req = get_action_req(batch_size, index); > > > + if (req == NULL) { > > > + fprintf(stderr, "get_action_req error: not enough buffer\n"); > > > + return -ENOMEM; > > > + } > > > + req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)); > > > + req->n.nlmsg_flags = NLM_F_REQUEST | flags; > > > + req->n.nlmsg_type = cmd; > > > + req->t.tca_family = AF_UNSPEC; > > > + tail = NLMSG_TAIL(&req->n); > > > > > > argc -= 1; > > > argv += 1; > > > - if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) { > > > + if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) { > > > fprintf(stderr, "Illegal \"action\"\n"); > > > return -1; > > > } > > > - tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail; > > > + tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail; > > > > > > - if (rtnl_talk(&rth, &req.n, NULL) < 0) { > > > + *argc_p = argc; > > > + *argv_p = argv; > > > + > > > 
+ iov->iov_base = &req->n; > > > + iov->iov_len = req->n.nlmsg_len; > > > + > > > + if (!send) > > > + return 0; > > > + > > > + if (rtnl_talk_msg(&rth, &msg, NULL) < 0) { > > > fprintf(stderr, "We have an error talking to the kernel\n"); > > > ret = -1; > > > } > > > > > > - *argc_p = argc; > > > - *argv_p = argv; > > > - > > > return ret; > > > } > > > > > > @@ -679,7 +726,7 @@ bad_val: > > > return ret; > > > } > > > > > > -int do_action(int argc, char **argv) > > > +int do_action(int
[net-next 05/10] net/mlx5e: IPoIB, Use correct timestamp in child receive flow
From: Feras Daoud The current implementation takes the child timestamp object from the parent since the rq in mlx5i_complete_rx_cqe belongs to the parent. This change fixes the issue by taking the correct timestamp. Fixes: 7e7f4780c340 ("net/mlx5e: IPoIB, Use hash-table to map between QPN to child netdev") Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 90354e676f0d..ff234dfefc27 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1175,7 +1175,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, u32 cqe_bcnt, struct sk_buff *skb) { + struct hwtstamp_config *tstamp; struct net_device *netdev; + struct mlx5e_priv *priv; char *pseudo_header; u32 qpn; u8 *dgid; @@ -1194,6 +1196,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, return; } + priv = mlx5i_epriv(netdev); + tstamp = &priv->tstamp; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; dgid = skb->data + MLX5_IB_GRH_DGID_OFFSET; if ((!g) || dgid[0] != 0xff) @@ -1214,7 +1219,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)cqe->check_sum); - if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + if (unlikely(mlx5e_rx_hw_stamp(tstamp))) skb_hwtstamps(skb)->hwtstamp = mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); -- 2.13.0
[net-next 02/10] net/mlx5: Hairpin pair core object setup
From: Or Gerlitz Low level code to setup hairpin pair core object, deals with: - create hairpin RQs/SQs - destroy hairpin RQs/SQs - modifying hairpin RQs/SQs - pairing (rst2rdy) and unpairing (rdy2rst) Unlike conventional RQs/SQs, the memory used for the packet and descriptor buffers is allocated by the firmware and not the driver. The driver sets the overall data size (log). Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 184 + include/linux/mlx5/transobj.h | 19 +++ 2 files changed, 203 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index 5e128d7a9ffd..a09ebbaf3b68 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -398,3 +398,187 @@ void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_rqt); + +static int mlx5_hairpin_create_rq(struct mlx5_core_dev *mdev, + struct mlx5_hairpin_params *params, u32 *rqn) +{ + u32 in[MLX5_ST_SZ_DW(create_rq_in)] = {0}; + void *rqc, *wq; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + + MLX5_SET(rqc, rqc, hairpin, 1); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, counter_set_id, params->q_counter); + + MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size); + + return mlx5_core_create_rq(mdev, in, MLX5_ST_SZ_BYTES(create_rq_in), rqn); +} + +static int mlx5_hairpin_create_sq(struct mlx5_core_dev *mdev, + struct mlx5_hairpin_params *params, u32 *sqn) +{ + u32 in[MLX5_ST_SZ_DW(create_sq_in)] = {0}; + void *sqc, *wq; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(sqc, sqc, hairpin, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + + MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size); + + return 
mlx5_core_create_sq(mdev, in, MLX5_ST_SZ_BYTES(create_sq_in), sqn); +} + +static int mlx5_hairpin_create_queues(struct mlx5_hairpin *hp, + struct mlx5_hairpin_params *params) +{ + int err; + + err = mlx5_hairpin_create_rq(hp->func_mdev, params, &hp->rqn); + if (err) + goto out_err_rq; + + err = mlx5_hairpin_create_sq(hp->peer_mdev, params, &hp->sqn); + if (err) + goto out_err_sq; + + return 0; + +out_err_sq: + mlx5_core_destroy_rq(hp->func_mdev, hp->rqn); +out_err_rq: + return err; +} + +static void mlx5_hairpin_destroy_queues(struct mlx5_hairpin *hp) +{ + mlx5_core_destroy_rq(hp->func_mdev, hp->rqn); + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn); +} + +static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, + int curr_state, int next_state, + u16 peer_vhca, u32 peer_sq) +{ + u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {0}; + void *rqc; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + if (next_state == MLX5_RQC_STATE_RDY) { + MLX5_SET(rqc, rqc, hairpin_peer_sq, peer_sq); + MLX5_SET(rqc, rqc, hairpin_peer_vhca, peer_vhca); + } + + MLX5_SET(modify_rq_in, in, rq_state, curr_state); + MLX5_SET(rqc, rqc, state, next_state); + + return mlx5_core_modify_rq(func_mdev, rqn, + in, MLX5_ST_SZ_BYTES(modify_rq_in)); +} + +static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, + int curr_state, int next_state, + u16 peer_vhca, u32 peer_rq) +{ + u32 in[MLX5_ST_SZ_DW(modify_sq_in)] = {0}; + void *sqc; + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + if (next_state == MLX5_RQC_STATE_RDY) { + MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq); + MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca); + } + + MLX5_SET(modify_sq_in, in, sq_state, curr_state); + MLX5_SET(sqc, sqc, state, next_state); + + return mlx5_core_modify_sq(peer_mdev, sqn, + in, MLX5_ST_SZ_BYTES(modify_sq_in)); +} + +static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp) +{ + int err; + + /* set peer SQ */ + err = mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn, 
+MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY, +MLX5_CAP_GEN(hp->func_mdev, vhca_id), hp->rqn); + if (err) + goto err_modify_sq; + + /* set func RQ */ + err = mlx5_hairpin_modify_rq(hp->func_mdev,
[net-next 07/10] net/mlx5e: IPoIB, Add ethtool support to get child time stamping parameters
From: Feras Daoud Add support to get time stamping capabilities using ethtool for child interface. Usage example: ethtool -T CHILD-DEVNAME This change reuses the functionality of parent devices and does not introduce any new logic. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index 6f338a9219c8..90cb50fe17fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -254,4 +254,5 @@ const struct ethtool_ops mlx5i_ethtool_ops = { const struct ethtool_ops mlx5i_pkey_ethtool_ops = { .get_drvinfo= mlx5i_get_drvinfo, .get_link = ethtool_op_get_link, + .get_ts_info= mlx5i_get_ts_info, }; -- 2.13.0
[pull request][net-next 00/10] Mellanox, mlx5 updates 2018-01-08
Hi Dave, This series includes updates for the mlx5 driver; for full information please see the tag log message below. The series doesn't cause any conflict with Andy's "net: create dynamic software irq moderation library". Both merge together seamlessly. Please pull and let me know if there's any problem. Thanks, Saeed. --- The following changes since commit f4803f1b73f877a571be4c8e531dfcf190acc691: net: tipc: remove unused hardirq.h (2018-01-08 20:59:25 -0500) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5-updates-2018-01-08 for you to fetch changes up to b8a0dbe3a90b2debd72cd9a304eacac55c44e5a4: net/mlx5e: E-switch, Add steering drop counters (2018-01-09 07:40:48 +0200) mlx5-updates-2018-01-08 Four patches from Or that add Hairpin support to mlx5: === From: Or Gerlitz We refer to the ability of NIC HW to forward packets received on one port to the other port (also from a port to itself) as hairpin. The application API is based on ingress tc/flower rules set on the NIC with the mirred redirect action. Other actions can apply to packets during the redirect. Hairpin allows offloading the data-path of various SW DDoS gateways, load-balancers, etc. to HW. Packets go through all the required processing in HW (header re-write, encap/decap, push/pop vlan) and are then forwarded; the CPU stays at practically zero usage. HW Flow counters are used by the control plane for monitoring and accounting. Hairpin is implemented by pairing a receive queue (RQ) to a send queue (SQ). All the flows that share the same peer device are redirected through the same hairpin pair. Currently, only header-rewrite is supported as a packet modification action. I'd like to thank Elijah Shakkour for implementing this functionality on the HW simulator before it was available in the FW, so the driver code could be tested early. 
=== From Feras, three patches that provide very small changes that allow IPoIB to support RX timestamping for child interfaces, simply by hooking the mlx5e timestamping PTP ioctl to the IPoIB child interface netdev profile. One patch from Gal to fix a spelling mistake. Two patches from Eugenia add drop counters to VF statistics, to be reported as part of VF statistics in netlink (iproute2), and implement them in the mlx5 eswitch. Eugenia Emantayev (2): net/core: Add drop counters to VF statistics net/mlx5e: E-switch, Add steering drop counters Feras Daoud (3): net/mlx5e: IPoIB, Use correct timestamp in child receive flow net/mlx5e: IPoIB, Add PTP ioctl support for child interface net/mlx5e: IPoIB, Add ethtool support to get child time stamping parameters Gal Pressman (1): net/mlx5e: IPoIB, Fix spelling mistake "functionts" -> "functions" Or Gerlitz (4): net/mlx5: Add hairpin definitions to the FW API net/mlx5: Hairpin pair core object setup net/mlx5e: Basic setup of hairpin object net/mlx5e: Support offloading TC NIC hairpin flows drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c| 7 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 280 - drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 99 +++- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 7 + drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 + .../net/ethernet/mellanox/mlx5/core/fs_counters.c | 6 + .../ethernet/mellanox/mlx5/core/ipoib/ethtool.c| 1 + .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 3 +- .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c | 7 + drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 184 ++ include/linux/if_link.h| 2 + include/linux/mlx5/mlx5_ifc.h | 43 +++- include/linux/mlx5/transobj.h | 19 ++ include/uapi/linux/if_link.h | 2 + net/core/rtnetlink.c | 10 +- 17 files changed, 649 insertions(+), 27 deletions(-)
[net-next 08/10] net/mlx5e: IPoIB, Fix spelling mistake "functionts" -> "functions"
From: Gal Pressman Fix trivial spelling mistake: "functionts" -> "functions". Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h index 5e87d04652d2..6d9053bcbe95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -76,7 +76,7 @@ int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn); /* Get the net-device corresponding to the given underlay QPN */ struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn); -/* Shared ndo functionts */ +/* Shared ndo functions */ int mlx5i_dev_init(struct net_device *dev); void mlx5i_dev_cleanup(struct net_device *dev); int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); -- 2.13.0
[net-next 04/10] net/mlx5e: Support offloading TC NIC hairpin flows
From: Or Gerlitz We refer to a TC NIC rule that involves forwarding as "hairpin". All hairpin rules from the current NIC device (called "func" in the code) to a given NIC device ("peer") are steered into the same hairpin RQ/SQ pair. The hairpin pair is set on demand and removed when there are no TC rules that need it. Here's a TC rule that matches on icmp, does header re-write of the dst mac and hairpin from RX/enp1s2f1 to TX/enp1s2f2 (enp1s2f1/2 are two mlx5 devices): tc filter add dev enp1s2f1 protocol ip parent ffff: prio 2 flower skip_sw ip_proto icmp action pedit ex munge eth dst set 10:22:33:44:55:66 pipe action mirred egress redirect dev enp1s2f2 Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h| 1 + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 183 ++-- 2 files changed, 172 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 5299310f2481..72bab8d3f4b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -659,6 +659,7 @@ struct mlx5e_tc_table { struct rhashtable ht; DECLARE_HASHTABLE(mod_hdr_tbl, 8); + DECLARE_HASHTABLE(hairpin_tbl, 8); }; struct mlx5e_vlan_table { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 55a527bda2e5..cf528da51243 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -56,12 +56,14 @@ struct mlx5_nic_flow_attr { u32 action; u32 flow_tag; u32 mod_hdr_id; + u32 hairpin_tirn; }; enum { MLX5E_TC_FLOW_ESWITCH = BIT(0), MLX5E_TC_FLOW_NIC = BIT(1), MLX5E_TC_FLOW_OFFLOADED = BIT(2), + MLX5E_TC_FLOW_HAIRPIN = BIT(3), }; struct mlx5e_tc_flow { @@ -71,6 +73,7 @@ struct mlx5e_tc_flow { struct mlx5_flow_handle *rule; struct list_headencap; /* flows sharing the same encap ID */ struct list_headmod_hdr; /* flows sharing 
the same mod hdr ID */ + struct list_headhairpin; /* flows sharing the same hairpin */ union { struct mlx5_esw_flow_attr esw_attr[0]; struct mlx5_nic_flow_attr nic_attr[0]; @@ -101,6 +104,17 @@ struct mlx5e_hairpin { u32 tirn; }; +struct mlx5e_hairpin_entry { + /* a node of a hash table which keeps all the hairpin entries */ + struct hlist_node hairpin_hlist; + + /* flows sharing the same hairpin */ + struct list_head flows; + + int peer_ifindex; + struct mlx5e_hairpin *hp; +}; + struct mod_hdr_key { int num_actions; void *actions; @@ -319,6 +333,98 @@ static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp) kvfree(hp); } +static struct mlx5e_hairpin_entry *mlx5e_hairpin_get(struct mlx5e_priv *priv, +int peer_ifindex) +{ + struct mlx5e_hairpin_entry *hpe; + + hash_for_each_possible(priv->fs.tc.hairpin_tbl, hpe, + hairpin_hlist, peer_ifindex) { + if (hpe->peer_ifindex == peer_ifindex) + return hpe; + } + + return NULL; +} + +static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + int peer_ifindex = parse_attr->mirred_ifindex; + struct mlx5_hairpin_params params; + struct mlx5e_hairpin_entry *hpe; + struct mlx5e_hairpin *hp; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, hairpin)) { + netdev_warn(priv->netdev, "hairpin is not supported\n"); + return -EOPNOTSUPP; + } + + hpe = mlx5e_hairpin_get(priv, peer_ifindex); + if (hpe) + goto attach_flow; + + hpe = kzalloc(sizeof(*hpe), GFP_KERNEL); + if (!hpe) + return -ENOMEM; + + INIT_LIST_HEAD(&hpe->flows); + hpe->peer_ifindex = peer_ifindex; + + params.log_data_size = 15; + params.log_data_size = min_t(u8, params.log_data_size, +MLX5_CAP_GEN(priv->mdev, log_max_hairpin_wq_data_sz)); + params.log_data_size = max_t(u8, params.log_data_size, +MLX5_CAP_GEN(priv->mdev, log_min_hairpin_wq_data_sz)); + params.q_counter = priv->q_counter; + + hp = mlx5e_hairpin_create(priv, ¶ms, peer_ifindex); + if (IS_ERR(hp)) { + err = PTR_ERR(hp); + goto 
create_hairpin_err; + } + + netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x log data size %d\n", + hp->tirn, hp->pair->rqn, hp-
[net-next 03/10] net/mlx5e: Basic setup of hairpin object
From: Or Gerlitz Add the code to do basic setup for hairpin object which will later serve offloading TC flows. This includes calling the mlx5 core to create/destroy the hairpin pair object and setting the HW transport objects that will be used for steering matched flows to go through hairpin. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 97 + 1 file changed, 97 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 933275fe03b2..55a527bda2e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -93,6 +93,14 @@ enum { #define MLX5E_TC_TABLE_NUM_GROUPS 4 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE (1 << 16) +struct mlx5e_hairpin { + struct mlx5_hairpin *pair; + + struct mlx5_core_dev *func_mdev; + u32 tdn; + u32 tirn; +}; + struct mod_hdr_key { int num_actions; void *actions; @@ -222,6 +230,95 @@ static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv, } } +static +struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex) +{ + struct net_device *netdev; + struct mlx5e_priv *priv; + + netdev = __dev_get_by_index(net, ifindex); + priv = netdev_priv(netdev); + return priv->mdev; +} + +static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) +{ + u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {0}; + void *tirc; + int err; + + err = mlx5_core_alloc_transport_domain(hp->func_mdev, &hp->tdn); + if (err) + goto alloc_tdn_err; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, hp->pair->rqn); + MLX5_SET(tirc, tirc, transport_domain, hp->tdn); + + err = mlx5_core_create_tir(hp->func_mdev, in, MLX5_ST_SZ_BYTES(create_tir_in), &hp->tirn); + if (err) + goto create_tir_err; + + return 0; + +create_tir_err: + mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn); 
+alloc_tdn_err: + return err; +} + +static void mlx5e_hairpin_destroy_transport(struct mlx5e_hairpin *hp) +{ + mlx5_core_destroy_tir(hp->func_mdev, hp->tirn); + mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn); +} + +static struct mlx5e_hairpin * +mlx5e_hairpin_create(struct mlx5e_priv *priv, struct mlx5_hairpin_params *params, +int peer_ifindex) +{ + struct mlx5_core_dev *func_mdev, *peer_mdev; + struct mlx5e_hairpin *hp; + struct mlx5_hairpin *pair; + int err; + + hp = kzalloc(sizeof(*hp), GFP_KERNEL); + if (!hp) + return ERR_PTR(-ENOMEM); + + func_mdev = priv->mdev; + peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex); + + pair = mlx5_core_hairpin_create(func_mdev, peer_mdev, params); + if (IS_ERR(pair)) { + err = PTR_ERR(pair); + goto create_pair_err; + } + hp->pair = pair; + hp->func_mdev = func_mdev; + + err = mlx5e_hairpin_create_transport(hp); + if (err) + goto create_transport_err; + + return hp; + +create_transport_err: + mlx5_core_hairpin_destroy(hp->pair); +create_pair_err: + kfree(hp); + return ERR_PTR(err); +} + +static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp) +{ + mlx5e_hairpin_destroy_transport(hp); + mlx5_core_hairpin_destroy(hp->pair); + kvfree(hp); +} + static struct mlx5_flow_handle * mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow_parse_attr *parse_attr, -- 2.13.0
[net-next 09/10] net/core: Add drop counters to VF statistics
From: Eugenia Emantayev Modern hardware can decide to drop packets going to/from a VF. Add receive and transmit drop counters to be displayed at hypervisor layer in iproute2 per VF statistics. Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- include/linux/if_link.h | 2 ++ include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 10 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 4c54611e03e9..622658dfbf0a 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -13,6 +13,8 @@ struct ifla_vf_stats { __u64 tx_bytes; __u64 broadcast; __u64 multicast; + __u64 rx_dropped; + __u64 tx_dropped; }; struct ifla_vf_info { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 19fc02660e0c..f8f04fed6186 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -732,6 +732,8 @@ enum { IFLA_VF_STATS_BROADCAST, IFLA_VF_STATS_MULTICAST, IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, __IFLA_VF_STATS_MAX, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c688dc564b11..5421a3fd3ba1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + +/* IFLA_VF_STATS_RX_DROPPED */ +nla_total_size_64bit(sizeof(__u64)) + +/* IFLA_VF_STATS_TX_DROPPED */ +nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) { + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, 
IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } -- 2.13.0
[net-next 06/10] net/mlx5e: IPoIB, Add PTP ioctl support for child interface
From: Feras Daoud Add support to control precision time protocol on child interfaces using ioctl. This commit changes the following: - Change parent ioctl function to be non static - Reuse the parent ioctl function in child devices Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c | 7 +++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 8812d7208e8f..3b2363e93ba5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -41,7 +41,6 @@ static int mlx5i_open(struct net_device *netdev); static int mlx5i_close(struct net_device *netdev); static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu); -static int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static const struct net_device_ops mlx5i_netdev_ops = { .ndo_open= mlx5i_open, @@ -396,7 +395,7 @@ int mlx5i_dev_init(struct net_device *dev) return 0; } -static int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct mlx5e_priv *priv = mlx5i_epriv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h index 49008022c306..5e87d04652d2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -79,6 +79,7 @@ struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn); /* Shared ndo functionts */ int mlx5i_dev_init(struct net_device *dev); void mlx5i_dev_cleanup(struct net_device *dev); +int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); /* 
Parent profile functions */ void mlx5i_init(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c index 531b02cc979b..b69e9d847a6b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c @@ -140,6 +140,7 @@ static int mlx5i_pkey_close(struct net_device *netdev); static int mlx5i_pkey_dev_init(struct net_device *dev); static void mlx5i_pkey_dev_cleanup(struct net_device *netdev); static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu); +static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static const struct net_device_ops mlx5i_pkey_netdev_ops = { .ndo_open= mlx5i_pkey_open, @@ -147,6 +148,7 @@ static const struct net_device_ops mlx5i_pkey_netdev_ops = { .ndo_init= mlx5i_pkey_dev_init, .ndo_uninit = mlx5i_pkey_dev_cleanup, .ndo_change_mtu = mlx5i_pkey_change_mtu, + .ndo_do_ioctl= mlx5i_pkey_ioctl, }; /* Child NDOs */ @@ -174,6 +176,11 @@ static int mlx5i_pkey_dev_init(struct net_device *dev) return mlx5i_dev_init(dev); } +static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + return mlx5i_ioctl(dev, ifr, cmd); +} + static void mlx5i_pkey_dev_cleanup(struct net_device *netdev) { return mlx5i_dev_cleanup(netdev); -- 2.13.0
[net-next 10/10] net/mlx5e: E-switch, Add steering drop counters
From: Eugenia Emantayev Add flow counters to count packets dropped due to drop rules configured in eswitch egress and ingress ACLs. These counters will count VFs violations and incoming traffic drops. Will be presented on hypervisor via standard 'ip -s link show' command. Example: "ip -s link show dev enp5s0f0" 6: enp5s0f0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 24:8a:07:a5:28:f0 brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped overrun mcast 0 00 0 0 2 TX: bytes packets errors dropped carrier collsns 1406 17 0 0 0 0 vf 0 MAC 00:00:ca:fe:ca:fe, vlan 5, spoof checking off, link-state auto, trust off, query_rss off RX: bytes packets mcast bcast dropped 1666 29 14 32 0 TX: bytes packets dropped 2880 44 2412 Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 99 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 7 ++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 + .../net/ethernet/mellanox/mlx5/core/fs_counters.c | 6 ++ 4 files changed, 112 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 7649e36653d9..5ecf2cddc16d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -37,6 +37,7 @@ #include #include "mlx5_core.h" #include "eswitch.h" +#include "fs_core.h" #define UPLINK_VPORT 0x @@ -1123,8 +1124,12 @@ static void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw, static int esw_vport_ingress_config(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + struct mlx5_fc *counter = vport->ingress.drop_counter; + struct mlx5_flow_destination drop_ctr_dst = {0}; + struct mlx5_flow_destination *dst = NULL; struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_spec *spec; + int dest_num = 0; int err = 0; u8 *smac_v; @@ -1188,9 +1193,18 @@ static int esw_vport_ingress_config(struct 
mlx5_eswitch *esw, memset(spec, 0, sizeof(*spec)); flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter = counter; + dst = &drop_ctr_dst; + dest_num++; + } vport->ingress.drop_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, - &flow_act, NULL, 0); + &flow_act, dst, dest_num); if (IS_ERR(vport->ingress.drop_rule)) { err = PTR_ERR(vport->ingress.drop_rule); esw_warn(esw->dev, @@ -1210,8 +1224,12 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw, static int esw_vport_egress_config(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + struct mlx5_fc *counter = vport->egress.drop_counter; + struct mlx5_flow_destination drop_ctr_dst = {0}; + struct mlx5_flow_destination *dst = NULL; struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_spec *spec; + int dest_num = 0; int err = 0; esw_vport_cleanup_egress_rules(esw, vport); @@ -1262,9 +1280,18 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw, /* Drop others rule (star rule) */ memset(spec, 0, sizeof(*spec)); flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach egress drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter = counter; + dst = &drop_ctr_dst; + dest_num++; + } vport->egress.drop_rule = mlx5_add_flow_rules(vport->egress.acl, spec, - &flow_act, NULL, 0); + &flow_act, dst, dest_num); if (IS_ERR(vport->egress.drop_rule)) { err = PTR_ERR(vport->egress.drop_rule); esw_warn(esw->dev, @@ -1457,6 +1484,41 @@ static void esw_apply_vport_conf(struct mlx5_eswitch *esw, } } +static void esw_vport_create_drop_counters(struct mlx5_vport *vport) +{ + struct mlx5_core_dev *dev = vport->dev; + + if (MLX5_CAP_ESW_INGRESS_ACL(dev, flow_counter)) { + vport->ingress.drop_counter = 
mlx5_fc_create(dev, false); + if (IS_ERR(vport->ingress.drop_counter)) { +
[net-next 01/10] net/mlx5: Add hairpin definitions to the FW API
From: Or Gerlitz Add hairpin definitions to the IFC file. This includes the HCA ID, few HCA hairpin capabilities, new fields in RQ/SQ used later for the pairing and the WQ hairpin data size attribute. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 43 +++ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d44ec5f41d4a..78e36fc2609e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -794,7 +794,10 @@ enum { }; struct mlx5_ifc_cmd_hca_cap_bits { - u8 reserved_at_0[0x80]; + u8 reserved_at_0[0x30]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x40]; u8 log_max_srq_sz[0x8]; u8 log_max_qp_sz[0x8]; @@ -1023,12 +1026,19 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_3b8[0x3]; u8 log_min_stride_sz_sq[0x5]; - u8 reserved_at_3c0[0x1b]; + u8 hairpin[0x1]; + u8 reserved_at_3c1[0x2]; + u8 log_max_hairpin_queues[0x5]; + u8 reserved_at_3c8[0x3]; + u8 log_max_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3d0[0xb]; u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; u8 disable_local_lb[0x1]; - u8 reserved_at_3e2[0x9]; + u8 reserved_at_3e2[0x1]; + u8 log_min_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3e8[0x3]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; @@ -1162,7 +1172,10 @@ struct mlx5_ifc_wq_bits { u8 reserved_at_118[0x3]; u8 log_wq_sz[0x5]; - u8 reserved_at_120[0x15]; + u8 reserved_at_120[0xb]; + u8 log_hairpin_data_sz[0x5]; + u8 reserved_at_130[0x5]; + u8 log_wqe_num_of_strides[0x3]; u8 two_byte_shift_en[0x1]; u8 reserved_at_139[0x4]; @@ -2482,7 +2495,8 @@ struct mlx5_ifc_sqc_bits { u8 state[0x4]; u8 reg_umr[0x1]; u8 allow_swp[0x1]; - u8 reserved_at_e[0x12]; + u8 hairpin[0x1]; + u8 reserved_at_f[0x11]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; @@ -2490,7 +2504,13 @@ struct mlx5_ifc_sqc_bits { u8 reserved_at_40[0x8]; u8 cqn[0x18]; - u8 reserved_at_60[0x90]; + u8 reserved_at_60[0x8]; + 
u8 hairpin_peer_rq[0x18]; + + u8 reserved_at_80[0x10]; + u8 hairpin_peer_vhca[0x10]; + + u8 reserved_at_a0[0x50]; u8 packet_pacing_rate_limit_index[0x10]; u8 tis_lst_sz[0x10]; @@ -2562,7 +2582,8 @@ struct mlx5_ifc_rqc_bits { u8 state[0x4]; u8 reserved_at_c[0x1]; u8 flush_in_error_en[0x1]; - u8 reserved_at_e[0x12]; + u8 hairpin[0x1]; + u8 reserved_at_f[0x11]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; @@ -2576,7 +2597,13 @@ struct mlx5_ifc_rqc_bits { u8 reserved_at_80[0x8]; u8 rmpn[0x18]; - u8 reserved_at_a0[0xe0]; + u8 reserved_at_a0[0x8]; + u8 hairpin_peer_sq[0x18]; + + u8 reserved_at_c0[0x10]; + u8 hairpin_peer_vhca[0x10]; + + u8 reserved_at_e0[0xa0]; struct mlx5_ifc_wq_bits wq; }; -- 2.13.0
RE: [patch iproute2 v6 1/3] lib/libnetlink: Add a function rtnl_talk_msg
> -Original Message- > From: David Ahern [mailto:dsah...@gmail.com] > Sent: Saturday, January 6, 2018 1:51 AM > To: Chris Mi ; netdev@vger.kernel.org > Cc: gerlitz...@gmail.com; step...@networkplumber.org; > marcelo.leit...@gmail.com > Subject: Re: [patch iproute2 v6 1/3] lib/libnetlink: Add a function > rtnl_talk_msg > > On 1/4/18 12:34 AM, Chris Mi wrote: > > rtnl_talk can only send a single message to kernel. Add a new function > > rtnl_talk_msg that can send multiple messages to kernel. > > > > Signed-off-by: Chris Mi > > --- > > include/libnetlink.h | 3 +++ > > lib/libnetlink.c | 66 ++ > -- > > 2 files changed, 51 insertions(+), 18 deletions(-) > > > > I think you should add an argument to rtnl_talk_msg to return the number of > messages processed. That can be used to refine which line failed. As batch > size increases the current design puts the burden on the user to scan a lot of > lines to find the one that fails: > > tc -b tc.batch -bs 50 > RTNETLINK answers: File exists > We have an error talking to the kernel, -1 Command failed tc.batch:2-51 > > We should be able to tell them exactly which line failed. Done. > > Also, it would be better to call this rtnl_talk_iov, take an iov as an > argument > and have a common rtnl_talk_msg for existing code and this new one. > > As it stands you are having to add: >struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; > > to tc functions when it really only needs to know about iov's. Done.
Re: iscsi target regression due to "tcp: remove prequeue support" patch
Hi MNC & Florian, (Adding netdev + DaveM to CC) Catching up on pre-holiday threads, thanks for the heads up. Comments below. On Wed, 2017-12-13 at 23:56 -0600, Mike Christie wrote: > Hey Nick and Florian, > > Starting in 4.14 iscsi logins will fail around 50% of the time. > > I git bisected the issue down to this commit: > > commit e7942d0633c47c791ece6afa038be9cf977226de > Author: Florian Westphal > Date: Sun Jul 30 03:57:18 2017 +0200 > > tcp: remove prequeue support > > Nick, attached is the iscsi target log info when the login fails. > > You can see at: > > Dec 13 17:55:01 rhel73n1 kernel: Got Login Command, Flags 0x81, ITT: > 0x, CmdSN: 0x, ExpStatSN: 0xf86dc69b, CID: 0, Length: 65 > > we have got a login command and we seem to then go into > iscsit_do_rx_data -> sock_recvmsg > > We seem to get stuck in there though, because we stay blocked until: > > Dec 13 17:55:01 rhel73n1 kernel: Entering iscsi_target_sk_data_ready: > conn: 88b35cbb3000 > Dec 13 17:55:01 rhel73n1 kernel: Got LOGIN_FLAGS_READ_ACTIVE=1, conn: > 88b35cbb3000 > > where initiator side timeout fires 15 seconds later and it disconnects > the tcp connection, and we eventually break out of the recvmsg call: > > Dec 13 17:55:16 rhel73n1 kernel: Entering iscsi_target_sk_state_change > Dec 13 17:55:16 rhel73n1 kernel: __iscsi_target_sk_check_close: > TCP_CLOSE_WAIT|TCP_CLOSE,returning FALSE > > > > Dec 13 17:55:16 rhel73n1 kernel: rx_loop: 68, total_rx: 68, data: 68 > Dec 13 17:55:16 rhel73n1 kernel: iscsi_target_do_login_rx after > rx_login_io, 88b35cbb3000, kworker/2:2:1829 > Ok, the third login request payload (65 + 3 padded to 68 bytes) containing CHAP_N + CHAP_R keys remains blocked on sock_recvmsg() until the TPG login_timeout subsequently fires after 15 seconds of inactivity to terminate this login attempt. > Is the iscsi target doing something incorrect in its use of > sk_data_ready and sock_recvmsg or is the tcp patch at fault? 
From the logs, the sk_data_ready() -> iscsi_target_sk_data_ready() callbacks appear to be firing as expected. iscsi-target login does iscsit_do_rx_data() -> rx_data() -> sock_recvmsg(..., MSG_WAITALL) from a system_wq kworker process context after the iscsi_target_sk_data_ready() callback queues up iscsi_conn->login_work for execution, and sock_recvmsg() uses a single struct kvec iovec for struct msghdr. AFAICT, iscsi-target uses blocking kernel socket reads from process context, similar to kernel_recvmsg(..., MSG_WAITALL) with DRBD. Florian + DaveM, any idea why the removal of prequeue support is having an effect here?
[PATCH net-next 2/2] net: hns3: report the function type the same line with hns3_nic_get_stats64
The function type should be on the same line as the function name, or it may cause a display error if a patch edits the function. Here is an example: https://www.spinics.net/lists/netdev/msg476141.html Signed-off-by: Peng Li --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b23107d..14c7625 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1126,8 +1126,8 @@ static int hns3_nic_set_features(struct net_device *netdev, return 0; } -static void -hns3_nic_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +static void hns3_nic_get_stats64(struct net_device *netdev, +struct rtnl_link_stats64 *stats) { struct hns3_nic_priv *priv = netdev_priv(netdev); int queue_num = priv->ae_handle->kinfo.num_tqps; -- 1.9.1
[PATCH net-next 0/2] code improvements in HNS3 driver
This patchset addresses two comments from community review. [patch 1/2] reverts "net: hns3: Add packet statistics of netdev", as reported by Jakub Kicinski and David Miller. [patch 2/2] puts the function type on the same line as the hns3_nic_get_stats64 function name, as reported by Andrew Lunn. Peng Li (2): Revert "net: hns3: Add packet statistics of netdev" net: hns3: report the function type the same line with hns3_nic_get_stats64 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 4 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 80 +- 2 files changed, 3 insertions(+), 81 deletions(-) -- 1.9.1
[PATCH net-next 1/2] Revert "net: hns3: Add packet statistics of netdev"
This reverts commit 8491000754796c838a0081c267f9dd54ad2ccba3. It is duplicate to add statistics of netdev for ethtool -S. Signed-off-by: Peng Li --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 80 +- 1 file changed, 1 insertion(+), 79 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 1e8fac3..d3cb3ec 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -59,41 +59,6 @@ struct hns3_stats { #define HNS3_TQP_STATS_COUNT (HNS3_TXQ_STATS_COUNT + HNS3_RXQ_STATS_COUNT) -/* netdev stats */ -#define HNS3_NETDEV_STAT(_string, _member) { \ - .stats_string = _string,\ - .stats_offset = offsetof(struct rtnl_link_stats64, _member) \ -} - -static const struct hns3_stats hns3_netdev_stats[] = { - /* Rx per-queue statistics */ - HNS3_NETDEV_STAT("rx_packets", rx_packets), - HNS3_NETDEV_STAT("tx_packets", tx_packets), - HNS3_NETDEV_STAT("rx_bytes", rx_bytes), - HNS3_NETDEV_STAT("tx_bytes", tx_bytes), - HNS3_NETDEV_STAT("rx_errors", rx_errors), - HNS3_NETDEV_STAT("tx_errors", tx_errors), - HNS3_NETDEV_STAT("rx_dropped", rx_dropped), - HNS3_NETDEV_STAT("tx_dropped", tx_dropped), - HNS3_NETDEV_STAT("multicast", multicast), - HNS3_NETDEV_STAT("collisions", collisions), - HNS3_NETDEV_STAT("rx_length_errors", rx_length_errors), - HNS3_NETDEV_STAT("rx_over_errors", rx_over_errors), - HNS3_NETDEV_STAT("rx_crc_errors", rx_crc_errors), - HNS3_NETDEV_STAT("rx_frame_errors", rx_frame_errors), - HNS3_NETDEV_STAT("rx_fifo_errors", rx_fifo_errors), - HNS3_NETDEV_STAT("rx_missed_errors", rx_missed_errors), - HNS3_NETDEV_STAT("tx_aborted_errors", tx_aborted_errors), - HNS3_NETDEV_STAT("tx_carrier_errors", tx_carrier_errors), - HNS3_NETDEV_STAT("tx_fifo_errors", tx_fifo_errors), - HNS3_NETDEV_STAT("tx_heartbeat_errors", tx_heartbeat_errors), - HNS3_NETDEV_STAT("tx_window_errors", tx_window_errors), - HNS3_NETDEV_STAT("rx_compressed", 
rx_compressed), - HNS3_NETDEV_STAT("tx_compressed", tx_compressed), -}; - -#define HNS3_NETDEV_STATS_COUNT ARRAY_SIZE(hns3_netdev_stats) - #define HNS3_SELF_TEST_TPYE_NUM1 #define HNS3_NIC_LB_TEST_PKT_NUM 1 #define HNS3_NIC_LB_TEST_RING_ID 0 @@ -466,27 +431,6 @@ static u8 *hns3_get_strings_tqps(struct hnae3_handle *handle, u8 *data) return data; } -static u8 *hns3_netdev_stats_get_strings(u8 *data) -{ - int i; - - /* get strings for netdev */ - for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) { - snprintf(data, ETH_GSTRING_LEN, -hns3_netdev_stats[i].stats_string); - data += ETH_GSTRING_LEN; - } - - snprintf(data, ETH_GSTRING_LEN, "netdev_rx_dropped"); - data += ETH_GSTRING_LEN; - snprintf(data, ETH_GSTRING_LEN, "netdev_tx_dropped"); - data += ETH_GSTRING_LEN; - snprintf(data, ETH_GSTRING_LEN, "netdev_tx_timeout"); - data += ETH_GSTRING_LEN; - - return data; -} - static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data) { struct hnae3_handle *h = hns3_get_handle(netdev); @@ -498,7 +442,6 @@ static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data) switch (stringset) { case ETH_SS_STATS: - buff = hns3_netdev_stats_get_strings(buff); buff = hns3_get_strings_tqps(h, buff); h->ae_algo->ops->get_strings(h, stringset, (u8 *)buff); break; @@ -537,27 +480,6 @@ static u64 *hns3_get_stats_tqps(struct hnae3_handle *handle, u64 *data) return data; } -static u64 *hns3_get_netdev_stats(struct net_device *netdev, u64 *data) -{ - struct hns3_nic_priv *priv = netdev_priv(netdev); - const struct rtnl_link_stats64 *net_stats; - struct rtnl_link_stats64 temp; - u8 *stat; - int i; - - net_stats = dev_get_stats(netdev, &temp); - for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) { - stat = (u8 *)net_stats + hns3_netdev_stats[i].stats_offset; - *data++ = *(u64 *)stat; - } - - *data++ = netdev->rx_dropped.counter; - *data++ = netdev->tx_dropped.counter; - *data++ = priv->tx_timeout_count; - - return data; -} - /* hns3_get_stats - get detail statistics. 
* @netdev: net device * @stats: statistics info. @@ -574,7 +496,7 @@ static void hns3_get_stats(struct net_device *netdev, return; } - p = hns3_get_netdev_stats(netdev, p); + h->ae_algo->ops->update_stats(h, &netdev->stats); /* get per-queue stats */ p = hns3_get_stats_tqps(h, p); -- 1.9.1
[PATCH net-next v3 06/10] net/mlx5e: Change Mellanox references in DIM code
From: Andy Gospodarek Change all appropriate mlx5_am* and MLX5_AM* references to net_dim and NET_DIM, respectively, in code that handles dynamic interrupt moderation. Also change all references from 'am' to 'dim' when used as local variables and add generic profile references. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +- drivers/net/ethernet/mellanox/mlx5/core/en_dim.c | 14 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 40 ++- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/net_dim.c | 286 ++--- drivers/net/ethernet/mellanox/mlx5/core/net_dim.h | 63 ++--- 7 files changed, 225 insertions(+), 201 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 4ee06e7..4d1d298 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -238,8 +238,8 @@ struct mlx5e_params { u16 num_channels; u8 num_tc; bool rx_cqe_compress_def; - struct mlx5e_cq_moder rx_cq_moderation; - struct mlx5e_cq_moder tx_cq_moderation; + struct net_dim_cq_moder rx_cq_moderation; + struct net_dim_cq_moder tx_cq_moderation; bool lro_en; u32 lro_wqe_sz; u16 tx_max_inline; @@ -249,7 +249,7 @@ struct mlx5e_params { u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE]; bool vlan_strip_disable; bool scatter_fcs_en; - bool rx_am_enabled; + bool rx_dim_enabled; u32 lro_timeout; u32 pflags; struct bpf_prog *xdp_prog; @@ -528,7 +528,7 @@ struct mlx5e_rq { unsigned long state; intix; - struct mlx5e_rx_am am; /* Adaptive Moderation */ + struct net_dim dim; /* Dynamic Interrupt Moderation */ /* XDP */ struct bpf_prog *xdp_prog; @@ -1079,4 +1079,5 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params, u16 max_channels); u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev 
*mdev); +void mlx5e_rx_dim_work(struct work_struct *work); #endif /* __MLX5_EN_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c index b9b434b..f620325 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c @@ -32,17 +32,17 @@ #include "en.h" -void mlx5e_rx_am_work(struct work_struct *work) +void mlx5e_rx_dim_work(struct work_struct *work) { - struct mlx5e_rx_am *am = container_of(work, struct mlx5e_rx_am, - work); - struct mlx5e_rq *rq = container_of(am, struct mlx5e_rq, am); - struct mlx5e_cq_moder cur_profile = mlx5e_am_get_profile(am->mode, - am->profile_ix); + struct net_dim *dim = container_of(work, struct net_dim, + work); + struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim); + struct net_dim_cq_moder cur_profile = net_dim_get_profile(dim->mode, + dim->profile_ix); mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq, cur_profile.usec, cur_profile.pkts); - am->state = MLX5E_AM_START_MEASURE; + dim->state = NET_DIM_START_MEASURE; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8f05efa..51ae6df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -480,7 +480,7 @@ int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, coal->rx_max_coalesced_frames = priv->channels.params.rx_cq_moderation.pkts; coal->tx_coalesce_usecs = priv->channels.params.tx_cq_moderation.usec; coal->tx_max_coalesced_frames = priv->channels.params.tx_cq_moderation.pkts; - coal->use_adaptive_rx_coalesce = priv->channels.params.rx_am_enabled; + coal->use_adaptive_rx_coalesce = priv->channels.params.rx_dim_enabled; return 0; } @@ -534,7 +534,7 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, new_channels.params.tx_cq_moderation.pkts = coal->tx_max_coalesced_frames; 
new_channels.params.rx_cq_moderation.usec = coal->rx_coalesce_usecs; new_channels.params.rx_cq_moderation.pkts = coal->rx_max_coalesced_frames; - new_channels.params.rx_am_enabled = !!coal->use_adaptive_rx_coalesce; + new_channels.params.rx_dim_enabled
[PATCH net-next v3 05/10] net/mlx5e: Move generic functions to new file
From: Andy Gospodarek These functions were identified as ones that could be made generic and used by multiple drivers. Most of the contents of en_rx_am.c are moved to net_dim.c. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 4 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_dim.c | 48 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 102 --- drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 320 - drivers/net/ethernet/mellanox/mlx5/core/net_dim.c | 307 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h | 103 +++ 7 files changed, 461 insertions(+), 425 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 19b21b4..b46b6de2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -14,8 +14,8 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ fpga/ipsec.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ - en_tx.o en_rx.o en_rx_am.o en_txrx.o en_stats.o vxlan.o \ - en_arfs.o en_fs_ethtool.o en_selftest.o + en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \ + en_arfs.o en_fs_ethtool.o en_selftest.o net_dim.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e2e35ed..4ee06e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ 
-50,7 +50,7 @@ #include "wq.h" #include "mlx5_core.h" #include "en_stats.h" -#include "en_dim.h" +#include "net_dim.h" #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c new file mode 100644 index 000..b9b434b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + *copyright notice, this list of conditions and the following + *disclaimer. + * + * - Redistributions in binary form must reproduce the above + *copyright notice, this list of conditions and the following + *disclaimer in the documentation and/or other materials + *provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "en.h" + +void mlx5e_rx_am_work(struct work_struct *work) +{ + struct mlx5e_rx_am *am = container_of(work, struct mlx5e_rx_am, + work); + struct mlx5e_rq *rq = container_of(am, struct mlx5e_rq, am); + struct mlx5e_cq_moder cur_profile = mlx5e_am_get_profile(am->mode, + am->profile_ix); + + mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq, + cur_profile.usec, cur_profile.pkts); + + am->state = MLX5E_AM_START_MEASURE; +} + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h deleted file mode 100644 index a1497bab..000 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies
[PATCH net-next v3 03/10] net/mlx5e: Remove rq references in mlx5e_rx_am
From: Andy Gospodarek This makes mlx5e_am_sample more generic so that it can be called easily from a driver that does not use the same data structure to store these values in a single structure. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 6 -- drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 22 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 5 - 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h index 2031a21..7d5499a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h @@ -66,8 +66,10 @@ struct mlx5e_rx_am { /* Adaptive Moderation */ u8 tired; }; -struct mlx5e_rq; -void mlx5e_rx_am(struct mlx5e_rq *rq); +void mlx5e_rx_am(struct mlx5e_rx_am *am, +u16 event_ctr, +u64 packets, +u64 bytes); void mlx5e_rx_am_work(struct work_struct *work); struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c index e401d9d..1630076 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c @@ -264,13 +264,15 @@ static bool mlx5e_am_decision(struct mlx5e_rx_am_stats *curr_stats, return am->profile_ix != prev_ix; } -static void mlx5e_am_sample(struct mlx5e_rq *rq, +static void mlx5e_am_sample(u16 event_ctr, + u64 packets, + u64 bytes, struct mlx5e_rx_am_sample *s) { s->time = ktime_get(); - s->pkt_ctr = rq->stats.packets; - s->byte_ctr = rq->stats.bytes; - s->event_ctr = rq->cq.event_ctr; + s->pkt_ctr = packets; + s->byte_ctr = bytes; + s->event_ctr = event_ctr; } #define MLX5E_AM_NEVENTS 64 @@ -309,20 +311,22 @@ void mlx5e_rx_am_work(struct work_struct *work) am->state = MLX5E_AM_START_MEASURE; } -void mlx5e_rx_am(struct 
mlx5e_rq *rq) +void mlx5e_rx_am(struct mlx5e_rx_am *am, +u16 event_ctr, +u64 packets, +u64 bytes) { - struct mlx5e_rx_am *am = &rq->am; struct mlx5e_rx_am_sample end_sample; struct mlx5e_rx_am_stats curr_stats; u16 nevents; switch (am->state) { case MLX5E_AM_MEASURE_IN_PROGRESS: - nevents = BIT_GAP(BITS_PER_TYPE(u16), rq->cq.event_ctr, + nevents = BIT_GAP(BITS_PER_TYPE(u16), event_ctr, am->start_sample.event_ctr); if (nevents < MLX5E_AM_NEVENTS) break; - mlx5e_am_sample(rq, &end_sample); + mlx5e_am_sample(event_ctr, packets, bytes, &end_sample); mlx5e_am_calc_stats(&am->start_sample, &end_sample, &curr_stats); if (mlx5e_am_decision(&curr_stats, am)) { @@ -332,7 +336,7 @@ void mlx5e_rx_am(struct mlx5e_rq *rq) } /* fall through */ case MLX5E_AM_START_MEASURE: - mlx5e_am_sample(rq, &am->start_sample); + mlx5e_am_sample(event_ctr, packets, bytes, &am->start_sample); am->state = MLX5E_AM_MEASURE_IN_PROGRESS; break; case MLX5E_AM_APPLY_NEW_PROFILE: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index ab92298..1849169 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -79,7 +79,10 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) mlx5e_cq_arm(&c->sq[i].cq); if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM)) - mlx5e_rx_am(&c->rq); + mlx5e_rx_am(&c->rq.am, + c->rq.cq.event_ctr, + c->rq.stats.packets, + c->rq.stats.bytes); mlx5e_cq_arm(&c->rq.cq); mlx5e_cq_arm(&c->icosq.cq); -- 2.7.4
[PATCH net-next v3 02/10] net/mlx5e: Move interrupt moderation forward declarations
From: Andy Gospodarek Move these to newly created file to prepare to move these functions to a library. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 5 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index df9cbb3..e2e35ed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -833,10 +833,6 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix); void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix); void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi); -void mlx5e_rx_am(struct mlx5e_rq *rq); -void mlx5e_rx_am_work(struct work_struct *work); -struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode); - void mlx5e_update_stats(struct mlx5e_priv *priv, bool full); int mlx5e_create_flow_steering(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h index 9eeaa11..2031a21 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h @@ -66,4 +66,9 @@ struct mlx5e_rx_am { /* Adaptive Moderation */ u8 tired; }; +struct mlx5e_rq; +void mlx5e_rx_am(struct mlx5e_rq *rq); +void mlx5e_rx_am_work(struct work_struct *work); +struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode); + #endif /* MLX5_AM_H */ -- 2.7.4
[PATCH net-next v3 10/10] MAINTAINERS: add entry for Dynamic Interrupt Moderation
From: Andy Gospodarek Signed-off-by: Andy Gospodarek Signed-off-by: Tal Gilboa Acked-by: Saeed Mahameed --- MAINTAINERS | 5 + 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 753799d..178239dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4944,6 +4944,11 @@ S: Maintained F: lib/dynamic_debug.c F: include/linux/dynamic_debug.h +DYNAMIC INTERRUPT MODERATION +M: Tal Gilboa +S: Maintained +F: include/linux/net_dim.h + DZ DECSTATION DZ11 SERIAL DRIVER M: "Maciej W. Rozycki" S: Maintained -- 2.7.4
[PATCH net-next v3 08/10] net/dim: use struct net_dim_sample as arg to net_dim
From: Andy Gospodarek Simplify the arguments to net_dim() by formatting them into a struct net_dim_sample before calling the function. Signed-off-by: Andy Gospodarek Suggested-by: Tal Gilboa Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 13 - include/linux/net_dim.h | 10 +++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index a1c94fd..f292bb3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -78,11 +78,14 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) for (i = 0; i < c->num_tc; i++) mlx5e_cq_arm(&c->sq[i].cq); - if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM)) - net_dim(&c->rq.dim, - c->rq.cq.event_ctr, - c->rq.stats.packets, - c->rq.stats.bytes); + if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM)) { + struct net_dim_sample dim_sample; + net_dim_sample(c->rq.cq.event_ctr, + c->rq.stats.packets, + c->rq.stats.bytes, + &dim_sample); + net_dim(&c->rq.dim, dim_sample); + } mlx5e_cq_arm(&c->rq.cq); mlx5e_cq_arm(&c->icosq.cq); diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index 741510f..1c7e450 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -342,21 +342,18 @@ static inline void net_dim_calc_stats(struct net_dim_sample *start, } static inline void net_dim(struct net_dim *dim, - u16 event_ctr, - u64 packets, - u64 bytes) + struct net_dim_sample end_sample) { - struct net_dim_sample end_sample; struct net_dim_stats curr_stats; u16 nevents; switch (dim->state) { case NET_DIM_MEASURE_IN_PROGRESS: - nevents = BIT_GAP(BITS_PER_TYPE(u16), event_ctr, + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, dim->start_sample.event_ctr); if (nevents < NET_DIM_NEVENTS) break; - net_dim_sample(event_ctr, packets, bytes, &end_sample); net_dim_calc_stats(&dim->start_sample, 
&end_sample, &curr_stats); if (net_dim_decision(&curr_stats, dim)) { @@ -366,7 +363,6 @@ static inline void net_dim(struct net_dim *dim, } /* fall through */ case NET_DIM_START_MEASURE: - net_dim_sample(event_ctr, packets, bytes, &dim->start_sample); dim->state = NET_DIM_MEASURE_IN_PROGRESS; break; case NET_DIM_APPLY_NEW_PROFILE: -- 2.7.4
[PATCH net-next v3 01/10] net/mlx5e: Move interrupt moderation structs to new file
From: Andy Gospodarek Create new header file to prepare to move code that handles irq moderation to a library that lives in a header file. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 33 +--- drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 69 2 files changed, 70 insertions(+), 32 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 5299310..df9cbb3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -50,6 +50,7 @@ #include "wq.h" #include "mlx5_core.h" #include "en_stats.h" +#include "en_dim.h" #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) @@ -227,12 +228,6 @@ enum mlx5e_priv_flag { #define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */ #endif -struct mlx5e_cq_moder { - u16 usec; - u16 pkts; - u8 cq_period_mode; -}; - struct mlx5e_params { u8 log_sq_size; u8 rq_wq_type; @@ -473,32 +468,6 @@ struct mlx5e_mpw_info { u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE]; }; -struct mlx5e_rx_am_stats { - int ppms; /* packets per msec */ - int bpms; /* bytes per msec */ - int epms; /* events per msec */ -}; - -struct mlx5e_rx_am_sample { - ktime_t time; - u32 pkt_ctr; - u32 byte_ctr; - u16 event_ctr; -}; - -struct mlx5e_rx_am { /* Adaptive Moderation */ - u8 state; - struct mlx5e_rx_am_statsprev_stats; - struct mlx5e_rx_am_sample start_sample; - struct work_struct work; - u8 profile_ix; - u8 mode; - u8 tune_state; - u8 steps_right; - u8 steps_left; - u8 tired; -}; - /* a single cache unit is capable to serve one napi call (for non-striding rq) * or a MPWQE (for striding rq). 
*/ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h new file mode 100644 index 000..9eeaa11 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * Copyright (c) 2017-2018, Broadcom Limited + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + *copyright notice, this list of conditions and the following + *disclaimer. + * + * - Redistributions in binary form must reproduce the above + *copyright notice, this list of conditions and the following + *disclaimer in the documentation and/or other materials + *provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_AM_H +#define MLX5_AM_H + +struct mlx5e_cq_moder { + u16 usec; + u16 pkts; + u8 cq_period_mode; +}; + +struct mlx5e_rx_am_sample { + ktime_t time; + u32 pkt_ctr; + u32 byte_ctr; + u16 event_ctr; +}; + +struct mlx5e_rx_am_stats { + int ppms; /* packets per msec */ + int bpms; /* bytes per msec */ + int epms; /* events per msec */ +}; + +struct mlx5e_rx_am { /* Adaptive Moderation */ + u8 state; + struct mlx5e_rx_am_statsprev_stats; + struct mlx5e_rx_am_sample start_sample; + struct work_struct work; + u8 profile_ix; + u8 mode; + u8 tune_state; + u
[PATCH net-next v3 07/10] net/mlx5e: Move dynamic interrupt coalescing code to include/linux
From: Andy Gospodarek This move allows drivers to add private structure elements to track the number of packets, bytes, and interrupts events per ring. A driver also defines a workqueue handler to act on this collected data once per poll and modify the coalescing parameters per ring. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_dim.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/net_dim.c | 307 -- drivers/net/ethernet/mellanox/mlx5/core/net_dim.h | 108 --- include/linux/net_dim.h | 377 ++ 6 files changed, 380 insertions(+), 417 deletions(-) delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h create mode 100644 include/linux/net_dim.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index b46b6de2..c805769 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -15,7 +15,7 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \ - en_arfs.o en_fs_ethtool.o en_selftest.o net_dim.o + en_arfs.o en_fs_ethtool.o en_selftest.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 4d1d298..29b9675 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -47,10 +47,10 @@ #include #include #include +#include #include "wq.h" #include "mlx5_core.h" #include "en_stats.h" -#include "net_dim.h" #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) diff 
--git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c index f620325..2b89951 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include #include "en.h" void mlx5e_rx_dim_work(struct work_struct *work) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/net_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/net_dim.c deleted file mode 100644 index decb370..000 --- a/drivers/net/ethernet/mellanox/mlx5/core/net_dim.c +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2016, Mellanox Technologies. All rights reserved. - * Copyright (c) 2017-2018, Broadcom Limited. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - *copyright notice, this list of conditions and the following - *disclaimer. - * - * - Redistributions in binary form must reproduce the above - *copyright notice, this list of conditions and the following - *disclaimer in the documentation and/or other materials - *provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "en.h" - -#define NET_DIM_PARAMS_NUM_PROFILES 5 -/* Adaptive moderation profiles */ -#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 -#define NET_DIM_DEF_PROFILE_CQE 1 -#define NET_DIM_DEF_PROFILE_EQE 1 - -/* All profiles sizes must be NET_PARAMS_DIM_NUM_PROFILES */ -#define NET_DIM_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ -} - -#define NET_DIM_CQE_PROFILES
[PATCH net-next v3 09/10] bnxt_en: add support for software dynamic interrupt moderation
From: Andy Gospodarek This implements the changes needed for the bnxt_en driver to add support for dynamic interrupt moderation per ring. This does add additional counters in the receive path, but testing shows that any additional instructions are offset by throughput gain when the default configuration is for low latency. Signed-off-by: Andy Gospodarek Cc: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/Makefile | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 50 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 34 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c | 33 +++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 12 ++ 5 files changed, 119 insertions(+), 12 deletions(-) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile index 59c8ec9..7c560d5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/Makefile +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_BNXT) += bnxt_en.o -bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_devlink.o +bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 89c3c87..cf6ebf1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1645,6 +1645,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, rxr->rx_next_cons = NEXT_RX(cons); next_rx_no_prod: + cpr->rx_packets += 1; + cpr->rx_bytes += len; *raw_cons = tmp_raw_cons; return rc; @@ -1802,6 +1804,7 @@ static irqreturn_t bnxt_msix(int irq, void *dev_instance) struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; u32 cons = RING_CMP(cpr->cp_raw_cons); + cpr->event_ctr++; 
prefetch(&cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]); napi_schedule(&bnapi->napi); return IRQ_HANDLED; @@ -2025,6 +2028,15 @@ static int bnxt_poll(struct napi_struct *napi, int budget) break; } } + if (bp->flags & BNXT_FLAG_DIM) { + struct net_dim_sample dim_sample; + + net_dim_sample(cpr->event_ctr, + cpr->rx_packets, + cpr->rx_bytes, + &dim_sample); + net_dim(&cpr->dim, dim_sample); + } mmiowb(); return work_done; } @@ -2617,6 +2629,8 @@ static void bnxt_init_cp_rings(struct bnxt *bp) struct bnxt_ring_struct *ring = &cpr->cp_ring_struct; ring->fw_ring_id = INVALID_HW_RING_ID; + cpr->rx_ring_coal.coal_ticks = bp->rx_coal.coal_ticks; + cpr->rx_ring_coal.coal_bufs = bp->rx_coal.coal_bufs; } } @@ -4593,6 +4607,36 @@ static void bnxt_hwrm_set_coal_params(struct bnxt_coal *hw_coal, req->flags = cpu_to_le16(flags); } +int bnxt_hwrm_set_ring_coal(struct bnxt *bp, struct bnxt_napi *bnapi) +{ + struct hwrm_ring_cmpl_ring_cfg_aggint_params_input req_rx = {0}; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_coal coal; + unsigned int grp_idx; + + /* Tick values in micro seconds. +* 1 coal_buf x bufs_per_record = 1 completion record. 
+*/ + memcpy(&coal, &bp->rx_coal, sizeof(struct bnxt_coal)); + + coal.coal_ticks = cpr->rx_ring_coal.coal_ticks; + coal.coal_bufs = cpr->rx_ring_coal.coal_bufs; + + if (!bnapi->rx_ring) + return -ENODEV; + + bnxt_hwrm_cmd_hdr_init(bp, &req_rx, + HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS, -1, -1); + + bnxt_hwrm_set_coal_params(&coal, &req_rx); + + grp_idx = bnapi->index; + req_rx.ring_id = cpu_to_le16(bp->grp_info[grp_idx].cp_fw_ring_id); + + return hwrm_send_message(bp, &req_rx, sizeof(req_rx), +HWRM_CMD_TIMEOUT); +} + int bnxt_hwrm_set_coal(struct bnxt *bp) { int i, rc = 0; @@ -5715,7 +5759,13 @@ static void bnxt_enable_napi(struct bnxt *bp) int i; for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; bp->bnapi[i]->in_reset = false; + + if (bp->bnapi[i]->rx_ring) { + INIT_WORK(&cpr->dim.work, bnxt_dim_work); + cpr->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; + } napi_enable(&bp->bnapi[i]->napi); } } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt
[PATCH net-next v3 04/10] net/mlx5e: Move AM logic enums
From: Andy Gospodarek More movement to help make this code more generic. Signed-off-by: Andy Gospodarek Acked-by: Tal Gilboa Acked-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 26 ++ drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 25 - 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h index 7d5499a..a1497bab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h @@ -66,6 +66,32 @@ struct mlx5e_rx_am { /* Adaptive Moderation */ u8 tired; }; +/* Adaptive moderation logic */ +enum { + MLX5E_AM_START_MEASURE, + MLX5E_AM_MEASURE_IN_PROGRESS, + MLX5E_AM_APPLY_NEW_PROFILE, +}; + +enum { + MLX5E_AM_PARKING_ON_TOP, + MLX5E_AM_PARKING_TIRED, + MLX5E_AM_GOING_RIGHT, + MLX5E_AM_GOING_LEFT, +}; + +enum { + MLX5E_AM_STATS_WORSE, + MLX5E_AM_STATS_SAME, + MLX5E_AM_STATS_BETTER, +}; + +enum { + MLX5E_AM_STEPPED, + MLX5E_AM_TOO_TIRED, + MLX5E_AM_ON_EDGE, +}; + void mlx5e_rx_am(struct mlx5e_rx_am *am, u16 event_ctr, u64 packets, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c index 1630076..337dd60 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c @@ -82,31 +82,6 @@ struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode) return mlx5e_am_get_profile(rx_cq_period_mode, default_profile_ix); } -/* Adaptive moderation logic */ -enum { - MLX5E_AM_START_MEASURE, - MLX5E_AM_MEASURE_IN_PROGRESS, - MLX5E_AM_APPLY_NEW_PROFILE, -}; - -enum { - MLX5E_AM_PARKING_ON_TOP, - MLX5E_AM_PARKING_TIRED, - MLX5E_AM_GOING_RIGHT, - MLX5E_AM_GOING_LEFT, -}; - -enum { - MLX5E_AM_STATS_WORSE, - MLX5E_AM_STATS_SAME, - MLX5E_AM_STATS_BETTER, -}; - -enum { - MLX5E_AM_STEPPED, - MLX5E_AM_TOO_TIRED, - MLX5E_AM_ON_EDGE, -}; static bool mlx5e_am_on_top(struct 
mlx5e_rx_am *am) { -- 2.7.4
[PATCH net-next v3 00/10] net: create dynamic software irq moderation library
From: Andy Gospodarek This converts the dynamic interrupt moderation library from the mlx5e driver into a library so it can be used by any driver. The penultimate patch in this set adds support for this new dynamic interrupt moderation library in the bnxt_en driver and the last patch creates an entry in the MAINTAINERS file for this library. The main purpose of this code is to allow an administrator to make sure that default coalesce settings are optimized for low latency, but quickly adapt to handle high throughput/bulk traffic by altering how much time passes before popping an interrupt. For any new driver the following changes would be needed to use this library: - add elements in ring struct to track items needed by this library - create function that can be called to actually set coalesce settings for the driver Credit to Rob Rice and Lee Reed for doing some of the initial proof of concept and testing for this patch and Tal Gilboa and Or Gerlitz for their comments, etc on this set. v3: bnxt_en fix from Michael Chan, comment suggestion from Vasundhara Volam, and small mlx5e header file fix from Tal Gilboa. v2: Spelling fixes from Stephen Hemminger, bnxt_en suggestions from Michael Chan, spelling and formatting fixes from Or Gerlitz, and spelling and mlx5e changes suggested by Tal Gilboa. 
Andy Gospodarek (10): net/mlx5e: Move interrupt moderation structs to new file net/mlx5e: Move interrupt moderation forward declarations net/mlx5e: Remove rq references in mlx5e_rx_am net/mlx5e: Move AM logic enums net/mlx5e: Move generic functions to new file net/mlx5e: Change Mellanox references in DIM code net/mlx5e: Move dynamic interrupt coalescing code to include/linux net/dim: use struct net_dim_sample as arg to net_dim bnxt_en: add support for software dynamic interrupt moderation MAINTAINERS: add entry for Dynamic Interrupt Moderation MAINTAINERS| 5 + drivers/net/ethernet/broadcom/bnxt/Makefile| 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 50 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 34 +- drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c | 33 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 12 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 46 +-- drivers/net/ethernet/mellanox/mlx5/core/en_dim.c | 49 +++ .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 40 ++- drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 341 --- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 10 +- include/linux/net_dim.h| 373 + 14 files changed, 593 insertions(+), 410 deletions(-) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c create mode 100644 include/linux/net_dim.h -- 2.7.4
Re: WARNING: held lock freed!
On Mon, Jan 8, 2018 at 8:09 PM, Marcelo Ricardo Leitner wrote: > On Mon, Jan 08, 2018 at 06:01:14PM +0800, Xin Long wrote: >> On Mon, Jan 8, 2018 at 6:58 AM, syzbot >> wrote: >> > Hello, >> > >> > syzkaller hit the following crash on >> > 3219e264b984ec0a13923aa66385819c2e98d582 >> > git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master >> > compiler: gcc (GCC) 7.1.1 20170620 >> > .config is attached >> > Raw console output is attached. >> > C reproducer is attached >> > syzkaller reproducer is attached. See https://goo.gl/kgGztJ >> > for information about syzkaller reproducers >> > >> > >> > IMPORTANT: if you fix the bug, please add the following tag to the commit: >> > Reported-by: syzbot+ac6ea7baa4432811e...@syzkaller.appspotmail.com >> > It will help syzbot understand when the bug is fixed. See footer for >> > details. >> > If you forward the report, please keep this part and the footer. >> > >> > >> > = >> > WARNING: held lock freed! >> > 4.15.0-rc6+ #250 Not tainted >> > - >> > syzkaller065230/3505 is freeing memory 7fcba654-499fef26, >> > with a lock still held there! 
>> > (sk_lock-AF_INET6){+.+.}, at: [] lock_sock >> > include/net/sock.h:1461 [inline] >> > (sk_lock-AF_INET6){+.+.}, at: [ ] >> > sctp_wait_for_sndbuf+0x509/0x8d0 net/sctp/socket.c:8042 >> > 1 lock held by syzkaller065230/3505: >> > #0: (sk_lock-AF_INET6){+.+.}, at: [ ] lock_sock >> > include/net/sock.h:1461 [inline] >> > #0: (sk_lock-AF_INET6){+.+.}, at: [ ] >> > sctp_wait_for_sndbuf+0x509/0x8d0 net/sctp/socket.c:8042 >> > >> > stack backtrace: >> > CPU: 0 PID: 3505 Comm: syzkaller065230 Not tainted 4.15.0-rc6+ #250 >> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS >> > Google 01/01/2011 >> > Call Trace: >> > __dump_stack lib/dump_stack.c:17 [inline] >> > dump_stack+0x194/0x257 lib/dump_stack.c:53 >> > print_freed_lock_bug kernel/locking/lockdep.c:4379 [inline] >> > debug_check_no_locks_freed+0x32f/0x3c0 kernel/locking/lockdep.c:4412 >> > kmem_cache_free+0x68/0x2a0 mm/slab.c:3743 >> > sk_prot_free net/core/sock.c:1504 [inline] >> > __sk_destruct+0x622/0x910 net/core/sock.c:1585 >> > sk_destruct+0x47/0x80 net/core/sock.c:1593 >> > __sk_free+0x57/0x230 net/core/sock.c:1601 >> > sk_free+0x2a/0x40 net/core/sock.c:1612 >> > sock_put include/net/sock.h:1656 [inline] >> > sctp_association_destroy net/sctp/associola.c:424 [inline] >> > sctp_association_put+0x14c/0x2f0 net/sctp/associola.c:883 >> > sctp_wait_for_sndbuf+0x673/0x8d0 net/sctp/socket.c:8053 >> > sctp_sendmsg+0x277d/0x3360 net/sctp/socket.c:1974 >> > inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 >> > sock_sendmsg_nosec net/socket.c:636 [inline] >> > sock_sendmsg+0xca/0x110 net/socket.c:646 >> > SYSC_sendto+0x361/0x5c0 net/socket.c:1727 >> > SyS_sendto+0x40/0x50 net/socket.c:1695 >> > entry_SYSCALL_64_fastpath+0x23/0x9a >> > RIP: 0033:0x445db9 >> > RSP: 002b:7f2467415d98 EFLAGS: 0212 ORIG_RAX: 002c >> > RAX: ffda RBX: 006dbc84 RCX: 00445db9 >> > RDX: 0001 RSI: 2010bf14 RDI: 0005 >> > RBP: R08: 204d9000 R09: 001c >> > R10: R11: 0212 R12: 006dbc80 >> > R13: 209a9000 R14: 0100 R15: 0001 
>> > == >> > BUG: KASAN: use-after-free in debug_spin_lock_before >> > kernel/locking/spinlock_debug.c:83 [inline] >> > BUG: KASAN: use-after-free in do_raw_spin_lock+0x1e0/0x220 >> > kernel/locking/spinlock_debug.c:112 >> > Read of size 4 at addr 8801c17ab08c by task syzkaller065230/3505 >> >> It seems this commit is buggy: >> commit cea0cc80a6777beb6eb643d4ad53690e1ad1d4ff >> Author: Xin Long >> Date: Wed Nov 15 16:57:26 2017 +0800 >> >> sctp: use the right sk after waking up from wait_buf sleep >> >> I already had a patch to drop it and fix the old issue with a better way. > > Maybe you can quickly share it, so we have an idea what you are > talking about? Sure, in sctp_wait_for_sndbuf(). lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; I had this patch when doing cleanup in sctp_sendmsg, it will also make that cleanup easier. Some comments for it: After commit cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep"), it may change to lock another sk if the asoc has been peeled off in sctp_wait_for_sndbuf. However, the asoc's new sk could be already closed elsewhere, as it's in the sendmsg context of the old sk that can't avoid the new sk's closing.
[PATCH v2 bpf] bpf: introduce BPF_JIT_ALWAYS_ON config
The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from the Google Project Zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make the attacker's job harder, introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. 
Signed-off-by: Alexei Starovoitov --- init/Kconfig | 7 +++ kernel/bpf/core.c | 19 +++ lib/test_bpf.c | 11 +++ net/core/filter.c | 6 ++ net/core/sysctl_net_core.c | 6 ++ net/socket.c | 9 + 6 files changed, 50 insertions(+), 8 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 2934249fba46..5e2a4a391ba9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1392,6 +1392,13 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..b529982c3126 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,8 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1354,6 +1357,12 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} + /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, 
fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9e9748089270..f369889e521d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6250,9 +6250,8 @@ stati
Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution
On Mon, Jan 8, 2018 at 8:13 PM, Linus Torvalds wrote: > > # carry will be clear if idx >= max > cmpq %idx,%max Bah. Other way around. cmpq %max,%idx I'm a moron. > # mask will be clear if carry was clear, ~0 otherwise > sbbq %mask,%mask > > to generate mask directly. I might have screwed that up. Worth perhaps trying? More importantly, worth _testing_ and fixing my hand-waving "asm like this" crap. But I do think that simple two-instruction cmpq/sbbq sequence could get it right in just two trivial ALU instructions. Linus
Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution
On Mon, Jan 8, 2018 at 7:42 PM, Dan Williams wrote: > > originally from Linus and tweaked by Alexei and I: Sadly, that tweak - while clever - is wrong. > unsigned long _mask = ~(long)(_m - 1 - _i) >> BITS_PER_LONG - 1;\ Why? Because "(long)(_m-1-_i)" is not negative just because "i >= m". It can still be positive. Think "m = 100", "i=bignum". The subtraction will overflow and become positive again, the shift will shift to zero, and then the mask will become ~0. Now, you can fix it, but you need to be a tiny bit more clever. In particular, just make sure that you retain the high bit of "_i", basically making the rule be that a negative index is not ever valid. And then instead of "(_m - 1 - _i)", you use "(_i | (_m - 1 - _i))". Now the sign bit is set if _i had it set, _or_ if the subtraction turned negative, and you don't have to worry about the overflow situation. But it does require that extra step to be trustworthy. Still purely cheap arithmetic operations, although there is possibly some additional register pressure there. Somebody might be able to come up with something even more minimal (or find a fault in my fix of your tweak). Obviously, with architecture-specific code, you may well be able to do better, using the carry flag of the subtraction. For example, on x86, I think you could do it with just two instructions: # carry will be clear if idx >= max cmpq %idx,%max # mask will be clear if carry was clear, ~0 otherwise sbbq %mask,%mask to generate mask directly. I might have screwed that up. Worth perhaps trying? Linus
Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution
On Mon, Jan 8, 2018 at 7:11 PM, Eric W. Biederman wrote: > Dan Williams writes: > >> Static analysis reports that 'index' may be a user controlled value that >> is used as a data dependency reading 'rt' from the 'platform_label' >> array. In order to avoid potential leaks of kernel memory values, block >> speculative execution of the instruction stream that could issue further >> reads based on an invalid 'rt' value. > > > In detail. > a) This code is fast path packet forwarding code. Introducing an >unconditional pipeline stall is not ok. > >AKA either there is no speculation and so this is invulnerable >or there is speculation and you are creating an unconditional >pipeline stall here. > >My back of the napkin caluculations say that a pipeline stall >is about 20 cycles. Which is about the same length of time >as a modern cache miss. > >On a good day this code will perform with 0 cache misses. On a less >good day 1 cache miss. Which means you are quite possibly doubling >the runtime of mpls_forward. > > b) The array is dynamically allocated which should provide some >protection, as it will be more difficult to predict the address >of the array which is needed to craft an malicious userspace value. > > c) The code can be trivially modified to say: > > static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned > index) > { > struct mpls_route *rt = NULL; > > if (index < net->mpls.platform_labels) { > struct mpls_route __rcu **platform_label = > rcu_dereference(net->mpls.platform_label); > rt = rcu_dereference(platform_label[index & ((1 << 20) - 1)]); > } > return rt; > } > > AKA a static mask will ensure that there is not a primitive that can be > used to access all of memory. That is max a 1 cycle slowdown in the > code, which is a much better trade off. > > d) If we care more it is straight forward to modify >resize_platform_label_table() to ensure that the size of the array >is always a power of 2. 
> > e) The fact that a pointer is returned from the array and it is treated >like a pointer would seem to provide a defense against the >exfiltration technique of using the value read as an index into >a small array, that user space code can probe aliased cached >lines of, to see which value was dereferenced. > > > So to this patch in particular. > Nacked-by: "Eric W. Biederman" > > This code path will be difficult to exploit. This change messes with > performance. There are ways to make this code path useless while > preserving the performance of the code. > Thanks, Eric understood. The discussion over the weekend came to the conclusion that using a mask will be the default approach. The nospec_array_ptr() will be defined to something similar to the following, originally from Linus and tweaked by Alexei and I: #define __nospec_array_ptr(base, idx, sz) \ ({ \ union { typeof(&base[0]) _ptr; unsigned long _bit; } __u; \ unsigned long _i = (idx); \ unsigned long _m = (max); \ unsigned long _mask = ~(long)(_m - 1 - _i) >> BITS_PER_LONG - 1;\ OPTIMIZER_HIDE_VAR(_mask); \ __u._ptr = &base[_i & _mask]; \ __u._bit &= _mask; \ __u._ptr; \ }) Does that address your performance concerns?
Re: [v2] net: gianfar_ptp: move set_fipers() to spinlock protecting area
On Tue, Jan 09, 2018 at 11:02:33AM +0800, Yangbo Lu wrote: > set_fipers() calling should be protected by spinlock in > case that any interrupt breaks related registers setting > and the function we expect. This patch is to move set_fipers() > to spinlock protecting area in ptp_gianfar_adjtime(). > > Signed-off-by: Yangbo Lu Acked-by: Richard Cochran
Re: [PATCH] net: gianfar_ptp: move set_fipers() to spinlock protecting area
On Mon, Jan 08, 2018 at 10:53:40AM -0200, Fabio Estevam wrote: > On Mon, Jan 8, 2018 at 8:13 AM, Yangbo Lu wrote: > > set_fipers() calling should be protected by spinlock. > > This patch is to move set_fipers() to spinlock protecting > > area in ptp_gianfar_adjtime() function. > > It would be nice to explain why. Maybe this is important? /* Caller must hold etsects->lock. */ static void set_fipers(struct etsects *etsects) { set_alarm(etsects); gfar_write(&etsects->regs->tmr_fiper1, etsects->tmr_fiper1); gfar_write(&etsects->regs->tmr_fiper2, etsects->tmr_fiper2); } Thanks, Richard
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
On 2018/1/9 11:06, David Miller wrote: From: "lipeng (Y)" Date: Tue, 9 Jan 2018 10:48:04 +0800 So I think it is OK if you can revert [patch 12/20 ]("net: hns3: Add packet statistics of netdev"). I think it is OK if you send the revert patch, which is what I am asking for :-) . sure, i will send the revert patch. I have tested it in my local branch. Thanks Peng Li
RE: [PATCH] net: gianfar_ptp: move set_fipers() to spinlock protecting area
> -Original Message- > From: Fabio Estevam [mailto:feste...@gmail.com] > Sent: 2018年1月8日 20:54 > To: Y.b. Lu > Cc: Claudiu Manoil ; Richard Cochran > ; netdev@vger.kernel.org; linux-kernel > > Subject: Re: [PATCH] net: gianfar_ptp: move set_fipers() to spinlock > protecting > area > > On Mon, Jan 8, 2018 at 8:13 AM, Yangbo Lu wrote: > > set_fipers() calling should be protected by spinlock. > > This patch is to move set_fipers() to spinlock protecting area in > > ptp_gianfar_adjtime() function. > > It would be nice to explain why. [Y.b. Lu] Sent out v2 patch and explained in commit message :) Thanks. > > Thanks
[v2] net: gianfar_ptp: move set_fipers() to spinlock protecting area
set_fipers() calling should be protected by spinlock in case that any interrupt breaks related registers setting and the function we expect. This patch is to move set_fipers() to spinlock protecting area in ptp_gianfar_adjtime(). Signed-off-by: Yangbo Lu --- Changes for v2: - explained why spinlock was needed in commit message. --- drivers/net/ethernet/freescale/gianfar_ptp.c |3 +-- 1 files changed, 1 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c index 5441142..9f8d4f8 100644 --- a/drivers/net/ethernet/freescale/gianfar_ptp.c +++ b/drivers/net/ethernet/freescale/gianfar_ptp.c @@ -319,11 +319,10 @@ static int ptp_gianfar_adjtime(struct ptp_clock_info *ptp, s64 delta) now = tmr_cnt_read(etsects); now += delta; tmr_cnt_write(etsects, now); + set_fipers(etsects); spin_unlock_irqrestore(&etsects->lock, flags); - set_fipers(etsects); - return 0; } -- 1.7.1
[GIT] Networking
Highlights: 1) Frag and UDP handling fixes in i40e driver, from Amritha Nambiar and Alexander Duyck. 2) Undo unintentional UAPI change in netfilter conntrack, from Florian Westphal. 3) Revert a change to how error codes are returned from dev_get_valid_name(), it broke some apps. 4) Cannot cache routes for ipv6 tunnels in the tunnel is ipv4/ipv6 dual-stack. From Eli Cooper. 5) Fix missed PMTU updates in geneve, from Xin Long. 6) Cure double free in macvlan, from Gao Feng. 7) Fix heap out-of-bounds write in rds_message_alloc_sgs(), from Mohamed Ghannam. 8) FEC bug fixes from FUgang Duan (mis-accounting of dev_id, missed deferral of probe when the regulator is not ready yet). 9) Missing DMA mapping error checks in 3c59x, from Neil Horman. 10) Turn off Broadcom tags for some b53 switches, from Florian Fainelli. 11) Fix OOPS when get_target_net() is passed an SKB whose NETLINK_CB() isn't initialized. From Andrei Vagin. 12) Fix crashes in fib6_add(), from Wei Wang. 13) PMTU bug fixes in SCTP from Marcelo Ricardo Leitner. Please pull, thanks a lot! The following changes since commit 2758b3e3e630ba304fc4aca434d591e70e528298: Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2017-12-28 23:20:21 -0800) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git for you to fetch changes up to 50f3d740d376f664f6accc7e86c9afd8f1c7e1e4: sh_eth: fix TXALCR1 offsets (2018-01-08 14:31:38 -0500) Alexander Duyck (1): i40e/i40evf: Account for frags split over multiple descriptors in check linearize Amritha Nambiar (1): i40e: Remove UDP support for big buffer Andrei Vagin (1): rtnetlink: give a user socket to get_target_net() Arjun Vynipadath (1): cxgb4: Fix FW flash errors Benjamin Poirier (1): e1000e: Fix e1000_check_for_copper_link_ich8lan return value. Christophe JAILLET (1): mdio-sun4i: Fix a memory leak David S. 
Miller (10): Revert "net: core: dev_get_valid_name is now the same as dev_alloc_name_ns" Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue Merge branch 'fec-clean-up-in-the-cases-of-probe-error' Merge branch 'ena-fixes' Merge branch '40GbE' of git://git.kernel.org/.../jkirsher/net-queue Merge tag 'mac80211-for-davem-2018-01-04' of git://git.kernel.org/.../jberg/mac80211 Merge git://git.kernel.org/.../pablo/nf Merge tag 'linux-can-fixes-for-4.15-20180104' of git://git.kernel.org/.../mkl/linux-can Merge branch 'bnxt_en_fixes' Merge branch 'SCTP-PMTU-discovery-fixes' Eduardo Otubo (1): xen-netfront: enable device after manual module load Eli Cooper (1): ip6_tunnel: disable dst caching if tunnel is dual-stack Felix Janda (1): uapi libc compat: add fallback for unsupported libcs Florian Fainelli (1): net: dsa: b53: Turn off Broadcom tags for more switches Florian Westphal (1): netfilter: uapi: correct UNTRACKED conntrack state bit number Fugang Duan (3): net: fec: restore dev_id in the cases of probe error net: fec: defer probe if regulator is not ready net: fec: free/restore resource in related probe error pathes Gao Feng (1): macvlan: Fix one possible double free Gustavo A. R. 
Silva (1): phylink: mark expected switch fall-throughs in phylink_mii_ioctl Hangbin Liu (1): netfilter: nf_tables: fix potential NULL-ptr deref in nf_tables_dump_obj_done() Hao Chen (1): nl80211: Check for the required netlink attribute presence Hauke Mehrtens (1): uapi/if_ether.h: prevent redefinition of struct ethhdr Ido Schimmel (2): mlxsw: spectrum_router: Fix NULL pointer deref mlxsw: spectrum: Relax sanity checks during enslavement Jacob Keller (1): i40e: don't remove netdev->dev_addr when syncing uc list Jerome Brunet (1): net: stmmac: enable EEE in MII, GMII or RGMII only Jiri Pirko (1): i40e: flower: Fix return value for unsupported offload Johannes Berg (1): mac80211: mesh: drop frames appearing to be from us Jon Maloy (1): tipc: fix problems with multipoint-to-point flow control Luu An Phu (1): can: flex_can: Correct the checking for frame length in flexcan_start_xmit() Marcelo Ricardo Leitner (3): sctp: fix error path in sctp_stream_init sctp: do not retransmit upon FragNeeded if PMTU discovery is disabled sctp: fix the handling of ICMP Frag Needed for too small MTUs Martin Lederhilger (1): can: ems_usb: improve error reporting for error warning and error passive Mohamed Ghannam (2): RDS: Heap OOB write in rds_message_alloc_sgs() RDS: null pointer dereference in rds_atomic_free_op Neil Horman (1): 3c59x: fix missing dma_mapping_error check and bad ring refill logic Netanel Belgazal (2): net: ena: unmask MSI-X only after device initialization i
Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution
Dan Williams writes: > Static analysis reports that 'index' may be a user controlled value that > is used as a data dependency reading 'rt' from the 'platform_label' > array. In order to avoid potential leaks of kernel memory values, block > speculative execution of the instruction stream that could issue further > reads based on an invalid 'rt' value. In detail. a) This code is fast path packet forwarding code. Introducing an unconditional pipeline stall is not ok. AKA either there is no speculation and so this is invulnerable or there is speculation and you are creating an unconditional pipeline stall here. My back of the napkin caluculations say that a pipeline stall is about 20 cycles. Which is about the same length of time as a modern cache miss. On a good day this code will perform with 0 cache misses. On a less good day 1 cache miss. Which means you are quite possibly doubling the runtime of mpls_forward. b) The array is dynamically allocated which should provide some protection, as it will be more difficult to predict the address of the array which is needed to craft an malicious userspace value. c) The code can be trivially modified to say: static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) { struct mpls_route *rt = NULL; if (index < net->mpls.platform_labels) { struct mpls_route __rcu **platform_label = rcu_dereference(net->mpls.platform_label); rt = rcu_dereference(platform_label[index & ((1 << 20) - 1)]); } return rt; } AKA a static mask will ensure that there is not a primitive that can be used to access all of memory. That is max a 1 cycle slowdown in the code, which is a much better trade off. d) If we care more it is straight forward to modify resize_platform_label_table() to ensure that the size of the array is always a power of 2. 
e) The fact that a pointer is returned from the array and it is treated like a pointer would seem to provide a defense against the exfiltration technique of using the value read as an index into a small array, that user space code can probe aliased cached lines of, to see which value was dereferenced. So to this patch in particular. Nacked-by: "Eric W. Biederman" This code path will be difficult to exploit. This change messes with performance. There are ways to make this code path useless while preserving the performance of the code. Eric > > Based on an original patch by Elena Reshetova. > > Cc: "David S. Miller" > Cc: Eric W. Biederman > Cc: netdev@vger.kernel.org > Signed-off-by: Elena Reshetova > Signed-off-by: Dan Williams > --- > net/mpls/af_mpls.c | 12 +++- > 1 file changed, 7 insertions(+), 5 deletions(-) > > diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c > index 8ca9915befc8..ebcf0e246cfe 100644 > --- a/net/mpls/af_mpls.c > +++ b/net/mpls/af_mpls.c > @@ -8,6 +8,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -77,12 +78,13 @@ static void rtmsg_lfib(int event, u32 label, struct > mpls_route *rt, > static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned > index) > { > struct mpls_route *rt = NULL; > + struct mpls_route __rcu **platform_label = > + rcu_dereference(net->mpls.platform_label); > + struct mpls_route __rcu **rtp; > > - if (index < net->mpls.platform_labels) { > - struct mpls_route __rcu **platform_label = > - rcu_dereference(net->mpls.platform_label); > - rt = rcu_dereference(platform_label[index]); > - } > + if ((rtp = nospec_array_ptr(platform_label, index, > + net->mpls.platform_labels))) > + rt = rcu_dereference(*rtp); > return rt; > } >
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
From: "lipeng (Y)" Date: Tue, 9 Jan 2018 10:48:04 +0800 > So I think it is OK if you can revert [patch 12/20 ]("net: hns3: Add > packet statistics of netdev"). I think it is OK if you send the revert patch, which is what I am asking for :-)
Re: [PATCH net-next v2 1/6] net: Fix netdev_WARN_ONCE macro
From: Joe Perches Date: Mon, 08 Jan 2018 18:42:01 -0800 > On Sun, 2018-01-07 at 12:08 +0200, Gal Pressman wrote: >> netdev_WARN_ONCE is broken (whoops..), this fix will remove the >> unnecessary "condition" parameter, add the missing comma and change >> "arg" to "args". >> >> Fixes: 375ef2b1f0d0 ("net: Introduce netdev_*_once functions") >> Signed-off-by: Gal Pressman >> Reviewed-by: Saeed Mahameed >> --- >> include/linux/netdevice.h | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >> index 352066e..5ff1ef9 100644 >> --- a/include/linux/netdevice.h >> +++ b/include/linux/netdevice.h >> @@ -4407,8 +4407,8 @@ do { >> \ >> WARN(1, "netdevice: %s%s\n" format, netdev_name(dev), \ >> netdev_reg_state(dev), ##args) >> >> -#define netdev_WARN_ONCE(dev, condition, format, arg...)\ >> -WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev) \ >> +#define netdev_WARN_ONCE(dev, format, args...) >> \ >> +WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev), \ > > You sure you want the newline before the format? Hmmm, Gal please send me a relative fix for this.
Re: [PATCH v2] openvswitch: Trim off padding before L3+ netfilter processing
On 1/6/18 10:57, Pravin Shelar wrote: > On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk wrote: >> >> >> On Jan 5, 2018 22:17, "Pravin Shelar" wrote: >> >> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk >> wrote: >>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk >>> wrote: On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar wrote: > OVS already pull all required headers in skb linear data, so no need > to redo all of it. only check required is the ip-checksum validation. > I think we could avoid it in most of cases by checking skb length to > ipheader length before verifying the ip header-checksum. Shouldn't the IP header checksum be verified even earlier, like in key_extract(), before actually using any of the fields in the IP header? >>> >>> Something like this for verifying the IP header checksum (not tested): >>> >> AFAIU openflow does not need this verification, so it is not required >> in flow extract. >> >> >> Okay. How about my proposed trimming implementation, caching the pad length >> in the ovs cb? >> > Caching the length is not that simple, OVS actions can change the > length. Keeping it consistent with packet would be more work, so lets > calculate it in ovs-ct function. > Something like this? 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a38c80e..282325d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4084,6 +4084,8 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); +int skb_network_trim(struct sk_buff *skb); + /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 08f5740..c68e927 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4740,6 +4740,41 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, } EXPORT_SYMBOL(skb_checksum_trimmed); +/** + * skb_network_trim - trim skb to length specified by the network header + * @skb: the skb to trim + * + * Trims the skb to the length specified by the network header, + * removing any trailing padding. Leaves the skb alone if the protocol + * is not IP or IPv6. Frees the skb on error. + * + * Caller needs to pull the skb to the network header. 
+ */ +int skb_network_trim(struct sk_buff *skb) +{ + unsigned int len; + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + len = ntohs(ip_hdr(skb)->tot_len); + break; + case htons(ETH_P_IPV6): + len = sizeof(struct ipv6hdr) + + ntohs(ipv6_hdr(skb)->payload_len); + break; + default: + len = skb->len; + } + + err = pskb_trim_rcsum(skb, len); + if (unlikely(err)) + kfree_skb(skb); + + return err; +} +EXPORT_SYMBOL(skb_network_trim); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index b27c5c6..73418d3 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1112,6 +1112,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, nh_ofs = skb_network_offset(skb); skb_pull_rcsum(skb, nh_ofs); + err = skb_network_trim(skb); + if (err) + return err; + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { err = handle_fragments(net, key, info->zone.id, skb); if (err)
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
On 2018/1/9 9:54, David Miller wrote: From: Jakub Kicinski Date: Mon, 8 Jan 2018 17:50:21 -0800 Oh, I only noticed this extra misleading comment now. Unless each queue has a netdev, I don't see how these are per-queue. If it isn't per-queue I want this change reverted. [patch 12/20 ] add statistics of netdev for ethtool -S, netdev may have multi queue. As discussion here, it is duplicate to add this patch. I revert [patch 12/20 ] , and then test on my board, HNS3 basic function and ethtool -S work well. So I think it is OK if you can revert [patch 12/20 ]("net: hns3: Add packet statistics of netdev"). Thanks Peng Li .
Re: [PATCH net-next v2 1/6] net: Fix netdev_WARN_ONCE macro
On Sun, 2018-01-07 at 12:08 +0200, Gal Pressman wrote: > netdev_WARN_ONCE is broken (whoops..), this fix will remove the > unnecessary "condition" parameter, add the missing comma and change > "arg" to "args". > > Fixes: 375ef2b1f0d0 ("net: Introduce netdev_*_once functions") > Signed-off-by: Gal Pressman > Reviewed-by: Saeed Mahameed > --- > include/linux/netdevice.h | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 352066e..5ff1ef9 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -4407,8 +4407,8 @@ do { > \ > WARN(1, "netdevice: %s%s\n" format, netdev_name(dev), \ >netdev_reg_state(dev), ##args) > > -#define netdev_WARN_ONCE(dev, condition, format, arg...) \ > - WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev) \ > +#define netdev_WARN_ONCE(dev, format, args...) > \ > + WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev), \ You sure you want the newline before the format? > netdev_reg_state(dev), ##args) > > /* netif printk helpers, similar to netdev_printk */
RE: [PATCH net-next v2 5/6] bnx2x: Replace WARN_ONCE with netdev_WARN_ONCE
-Original Message- From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On Behalf Of Gal Pressman Sent: 07 January 2018 15:39 To: David S. Miller Cc: netdev@vger.kernel.org; Tariq Toukan ; Saeed Mahameed ; Gal Pressman ; Elior, Ariel Subject: [PATCH net-next v2 5/6] bnx2x: Replace WARN_ONCE with netdev_WARN_ONCE Use the more appropriate netdev_WARN_ONCE instead of WARN_ONCE macro. Signed-off-by: Gal Pressman Reviewed-by: Saeed Mahameed Cc: Ariel Elior --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 57eb26d..d7c98e8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -738,8 +738,9 @@ static void bnx2x_gro_receive(struct bnx2x *bp, struct bnx2x_fastpath *fp, bnx2x_gro_csum(bp, skb, bnx2x_gro_ipv6_csum); break; default: - WARN_ONCE(1, "Error: FW GRO supports only IPv4/IPv6, not 0x%04x\n", - be16_to_cpu(skb->protocol)); + netdev_WARN_ONCE(bp->dev, +"Error: FW GRO supports only IPv4/IPv6, not 0x%04x\n", +be16_to_cpu(skb->protocol)); } } #endif -- 2.7.4 Acked-by: Sudarsana Kalluru
RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> -Original Message- > From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil > Sutter > Sent: Monday, January 8, 2018 9:32 PM > To: Chris Mi > Cc: dsah...@gmail.com; marcelo.leit...@gmail.com; > netdev@vger.kernel.org; gerlitz...@gmail.com; > step...@networkplumber.org > Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode > > Hi Chris, > > On Mon, Jan 08, 2018 at 02:03:53AM +, Chris Mi wrote: > > > On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote: > > > > The insertion rate is improved more than 10%. > > > > > > Did you measure the effect of increasing batch sizes? > > Yes. Even if we enlarge the batch size bigger than 10, there is no big > improvement. > > I think that's because current kernel doesn't process the requests in > parallel. > > If kernel processes the requests in parallel, I believe specifying a > > bigger batch size will get a better result. > > But throughput doesn't regress at some point, right? I think that's the > critical > aspect when considering an "unlimited" batch size. > > On Mon, Jan 08, 2018 at 08:00:00AM +, Chris Mi wrote: > > After testing, I find that the message passed to kernel should not be too > big. > > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE). > > That is about 400 commands. So how about set batch size to 128 which is > big enough? > > If that's the easiest way, why not. At first, I thought one could maybe send > the collected messages in chunks of suitable size, but that's probably not > worth the effort. I did a testing. If we read a million commands in memory and send them in chunks of 128, we'll have a big regression. It takes about 21 seconds.
Re: [PATCH bpf-next] bpf: introduce BPF_JIT_ALWAYS_ON config
On 1/8/18 4:02 PM, Jakub Kicinski wrote: On Mon, 8 Jan 2018 22:59:04 +0100, Daniel Borkmann wrote: @@ -1453,6 +1457,11 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ *err = bpf_check_tail_call(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) + *err = -ENOTSUPP; +#endif I think programs JITed for offload won't have fp->jited set, but those are pretty safe from CPU bugs. Should we set fp->jited = 1; in bpf_prog_offload_compile()? Just throwing "&& !bpf_prog_is_dev_bound()" in here seems cleaner to me. good catch. will fix in the v2.
Re: [PATCH net-next 1/3] ethtool: Ensure new ring parameters are within bounds during SRINGPARAM
On Mon, 8 Jan 2018 16:00:24 +0200, Tariq Toukan wrote: > From: Eugenia Emantayev > > Add a sanity check to ensure that all requested ring parameters > are within bounds, which should reduce errors in driver implementation. (y) > Signed-off-by: Eugenia Emantayev > Signed-off-by: Tariq Toukan > --- > net/core/ethtool.c | 13 +++-- > 1 file changed, 11 insertions(+), 2 deletions(-) > > diff --git a/net/core/ethtool.c b/net/core/ethtool.c > index 50a79203043b..9ea7cd52fde0 100644 > --- a/net/core/ethtool.c > +++ b/net/core/ethtool.c > @@ -1704,14 +1704,23 @@ static int ethtool_get_ringparam(struct net_device > *dev, void __user *useraddr) > > static int ethtool_set_ringparam(struct net_device *dev, void __user > *useraddr) > { > - struct ethtool_ringparam ringparam; > + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; > > - if (!dev->ethtool_ops->set_ringparam) > + if (!dev->ethtool_ops->set_ringparam || > !dev->ethtool_ops->get_ringparam) > return -EOPNOTSUPP; > > if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) > return -EFAULT; > > + dev->ethtool_ops->get_ringparam(dev, &max); Perhaps check the return value here? It's pretty unlikely but get_ringparam may fail. > + /* ensure new ring parameters are within the maximums */ > + if (ringparam.rx_pending > max.rx_max_pending || > + ringparam.rx_mini_pending > max.rx_mini_max_pending || > + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || > + ringparam.tx_pending > max.tx_max_pending) > + return -EINVAL; > + > return dev->ethtool_ops->set_ringparam(dev, &ringparam); > } >
Re: [PATCH 07/18] [media] uvcvideo: prevent bounds-check bypass via speculative execution
On Mon, Jan 8, 2018 at 3:23 AM, Laurent Pinchart wrote: > Hi Dan, > > Thank you for the patch. > > On Saturday, 6 January 2018 03:10:32 EET Dan Williams wrote: >> Static analysis reports that 'index' may be a user controlled value that >> is used as a data dependency to read 'pin' from the >> 'selector->baSourceID' array. In order to avoid potential leaks of >> kernel memory values, block speculative execution of the instruction >> stream that could issue reads based on an invalid value of 'pin'. > > I won't repeat the arguments already made in the thread regarding having > documented coverity rules for this, even if I agree with them. > >> Based on an original patch by Elena Reshetova. >> >> Cc: Laurent Pinchart >> Cc: Mauro Carvalho Chehab >> Cc: linux-me...@vger.kernel.org >> Signed-off-by: Elena Reshetova >> Signed-off-by: Dan Williams >> --- >> drivers/media/usb/uvc/uvc_v4l2.c |7 +-- >> 1 file changed, 5 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/media/usb/uvc/uvc_v4l2.c >> b/drivers/media/usb/uvc/uvc_v4l2.c index 3e7e283a44a8..7442626dc20e 100644 >> --- a/drivers/media/usb/uvc/uvc_v4l2.c >> +++ b/drivers/media/usb/uvc/uvc_v4l2.c >> @@ -22,6 +22,7 @@ >> #include >> #include >> #include >> +#include >> >> #include >> #include >> @@ -810,6 +811,7 @@ static int uvc_ioctl_enum_input(struct file *file, void >> *fh, struct uvc_entity *iterm = NULL; >> u32 index = input->index; >> int pin = 0; >> + __u8 *elem; >> >> if (selector == NULL || >> (chain->dev->quirks & UVC_QUIRK_IGNORE_SELECTOR_UNIT)) { >> @@ -820,8 +822,9 @@ static int uvc_ioctl_enum_input(struct file *file, void >> *fh, break; >> } >> pin = iterm->id; >> - } else if (index < selector->bNrInPins) { >> - pin = selector->baSourceID[index]; >> + } else if ((elem = nospec_array_ptr(selector->baSourceID, index, >> + selector->bNrInPins))) { >> + pin = *elem; >> list_for_each_entry(iterm, &chain->entities, chain) { >> if (!UVC_ENTITY_IS_ITERM(iterm)) >> continue; > > (adding a bit more context) > 
>> if (iterm->id == pin) >> break; >> } >> } >> >> if (iterm == NULL || iterm->id != pin) >> return -EINVAL; >> >> memset(input, 0, sizeof(*input)); >> input->index = index; >> strlcpy(input->name, iterm->name, sizeof(input->name)); >> if (UVC_ENTITY_TYPE(iterm) == UVC_ITT_CAMERA) >> input->type = V4L2_INPUT_TYPE_CAMERA; > > So pin is used to search for an entry in the chain->entities list. Entries in > that list are allocated separately through kmalloc and can thus end up in > different cache lines, so I agree we have an issue. However, this is mitigated > by the fact that typical UVC devices have a handful (sometimes up to a dozen) > entities, so an attacker would only be able to read memory values that are > equal to the entity IDs used by the device. Entity IDs can be freely allocated > but typically count continuously from 0. It would take a specially-crafted UVC > device to be able to read all memory. > > On the other hand, as this is nowhere close to being a fast path, I think we > can close this potential hole as proposed in the patch. So, > > Reviewed-by: Laurent Pinchart Thanks Laurent! > Will you merge the whole series in one go, or would you like me to take the > patch in my tree ? In the latter case I'll wait until the nospec_array_ptr() > gets merged in mainline. I'll track it for now. Until the 'nospec_array_ptr()' discussion resolves there won't be a stabilized commit-id for you to base a branch.
Re: Subject: [RFC][PATCH 04/11] stmmac: fix breakage in stmmac_hw_setup()
From: Al Viro Date: Fri, 05 Jan 2018 19:31:58 + > Since "drivers: net: stmmac: reworking the PCS code" ->pcs_ctrl_ane() > had been taking iomem address to access as the first argument; its > predecessor (->ctrl_ane()) used to take struct mac_device_info instead. > > One of the callers had not been converted; as the result, instead of > reading and modifying a word in card iomem we read and modify a word > in (or near) the in-core struct mac_device_info. > > Fixes: 70523e639bf8 (drivers: net: stmmac: reworking the PCS code) > Cc: sta...@vger.kernel.org > Signed-off-by: Al Viro Yikes... Al, can you split the bug fixes like this one into a separate series for me to pull into my net GIT tree? Don't include the pure annotation or partial endianness conversion ones, those should go to net-next.
Re: [RESEND PATCH 2/3] net: ovs: remove unused hardirq.h
From: "Yang Shi" Date: Tue, 09 Jan 2018 03:52:53 +0800 > Preempt counter APIs have been split out, currently, hardirq.h just > includes irq_enter/exit APIs which are not used by openvswitch at all. > > So, remove the unused hardirq.h. > > Signed-off-by: Yang Shi > Acked-by: Pravin B Shelar Applied.
Re: [RESEND PATCH 1/3] net: caif: remove unused hardirq.h
From: "Yang Shi" Date: Tue, 09 Jan 2018 03:52:52 +0800 > Preempt counter APIs have been split out, currently, hardirq.h just > includes irq_enter/exit APIs which are not used by caif at all. > > So, remove the unused hardirq.h. > > Signed-off-by: Yang Shi Applied.
Re: [RESEND PATCH 3/3] net: tipc: remove unused hardirq.h
From: "Yang Shi" Date: Tue, 09 Jan 2018 03:52:54 +0800 > Preempt counter APIs have been split out, currently, hardirq.h just > includes irq_enter/exit APIs which are not used by TIPC at all. > > So, remove the unused hardirq.h. > > Signed-off-by: Yang Shi > Acked-by: Ying Xue > Tested-by: Ying Xue Applied.
Re: [PATCH 00/52] Netfilter/IPVS updates for net-next
From: Pablo Neira Ayuso Date: Mon, 8 Jan 2018 21:19:08 +0100 > The following patchset contains Netfilter/IPVS updates for your > net-next tree: ... > 4) Add generic flow table offload infrastructure for nf_tables, this >includes the netlink control plane and support for IPv4, IPv6 and >mixed IPv4/IPv6 dataplanes. This comes with NAT support too. This >patchset adds the IPS_OFFLOAD conntrack status bit to indicate that >this flow has been offloaded. Have driver maintainers signed off on your offload design and driver interfaces? I've pulled, but the above is really important to indicate when a new offload feature is added. Thanks.
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
From: Jakub Kicinski Date: Mon, 8 Jan 2018 17:50:21 -0800 > Oh, I only noticed this extra misleading comment now. Unless each queue > has a netdev, I don't see how these are per-queue. If it isn't per-queue I want this change reverted.
Re: [PATCH net-next v2 0/6] Replace WARN_ONCE usages with netdev_WARN_ONCE
From: Gal Pressman Date: Sun, 7 Jan 2018 12:08:34 +0200 > This series will fix an issue in netdev_WARN_ONCE, improve its formatting and > replace drivers' usage of WARN_ONCE to netdev_WARN_ONCE. > > Driver specific patches were compilation tested, in addition, functional > tested > on Mellanox NIC. > > v1->v2: > - Addressed commit message comments in patch #1 Series applied, thanks.
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
On Mon, 8 Jan 2018 17:46:02 -0800, Jakub Kicinski wrote: > On Mon, 08 Jan 2018 20:39:13 -0500 (EST), David Miller wrote: > > From: Jakub Kicinski > > Date: Mon, 8 Jan 2018 12:04:31 -0800 > > > > > Ugh, I so didn't review this in time :( I think there is a consensus > > > that we should avoid duplicating standard stats in ethtool. Especially > > > those old ones. Like "collisions", I assume this is a modern NIC, are > > > collisions still a thing? > > > > There is no standard way to get per-queue values, and ethtool stats are > > how pretty much every driver provides it. > > Right, agreed. I'm only objecting to this patch (12/20), where we can > see the telltale code like this: > > + const struct rtnl_link_stats64 *net_stats; > + struct rtnl_link_stats64 temp; > + > + net_stats = dev_get_stats(netdev, &temp); > + for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) { > + stat = (u8 *)net_stats + hns3_netdev_stats[i].stats_offset; > + *data++ = *(u64 *)stat; > + } > > Where: > > +#define HNS3_NETDEV_STAT(_string, _member) { \ > + .stats_string = _string,\ > + .stats_offset = offsetof(struct rtnl_link_stats64, _member) \ > +} > + > +static const struct hns3_stats hns3_netdev_stats[] = { > + /* Rx per-queue statistics */ Oh, I only noticed this extra misleading comment now. Unless each queue has a netdev, I don't see how these are per-queue. > + HNS3_NETDEV_STAT("rx_packets", rx_packets), > + HNS3_NETDEV_STAT("tx_packets", tx_packets), > > etc. IOW dumping struct rtnl_link_stats64 to ethtool -S member by > member. > > Let me put the netlink per-queue stats on my soft TODO list :) >
RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> -Original Message- > From: Stephen Hemminger [mailto:step...@networkplumber.org] > Sent: Monday, January 8, 2018 11:40 PM > To: Chris Mi > Cc: David Ahern ; Phil Sutter ; > marcelo.leit...@gmail.com; netdev@vger.kernel.org; gerlitz...@gmail.com > Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode > > On Mon, 8 Jan 2018 08:00:00 + > Chris Mi wrote: > > > > >> I wonder whether specifying the batch size is necessary at all. > > > >> Couldn't batch mode just collect messages until either EOF or an > > > >> incompatible command is encountered which then triggers a commit > > > >> to kernel? This might simplify code quite a bit. > > > > That's a good suggestion. > > > > > > Thanks for your time on this, Chris. > > After testing, I find that the message passed to kernel should not be too > big. > > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE). > > That is about 400 commands. So how about set batch size to 128 which is > big enough? > > > Use sendmmsg? Maybe we can try that, but there is also a limit on it.
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
On Mon, 08 Jan 2018 20:39:13 -0500 (EST), David Miller wrote: > From: Jakub Kicinski > Date: Mon, 8 Jan 2018 12:04:31 -0800 > > > Ugh, I so didn't review this in time :( I think there is a consensus > > that we should avoid duplicating standard stats in ethtool. Especially > > those old ones. Like "collisions", I assume this is a modern NIC, are > > collisions still a thing? > > There is no standard way to get per-queue values, and ethtool stats are > how pretty much every driver provides it. Right, agreed. I'm only objecting to this patch (12/20), where we can see the telltale code like this: + const struct rtnl_link_stats64 *net_stats; + struct rtnl_link_stats64 temp; + + net_stats = dev_get_stats(netdev, &temp); + for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) { + stat = (u8 *)net_stats + hns3_netdev_stats[i].stats_offset; + *data++ = *(u64 *)stat; + } Where: +#define HNS3_NETDEV_STAT(_string, _member) { \ + .stats_string = _string,\ + .stats_offset = offsetof(struct rtnl_link_stats64, _member) \ +} + +static const struct hns3_stats hns3_netdev_stats[] = { + /* Rx per-queue statistics */ + HNS3_NETDEV_STAT("rx_packets", rx_packets), + HNS3_NETDEV_STAT("tx_packets", tx_packets), etc. IOW dumping struct rtnl_link_stats64 to ethtool -S member by member. Let me put the netlink per-queue stats on my soft TODO list :)
[PATCH] ath9k: add a quirk to set use_msi automatically
Some platform(BIOS) blocks legacy interrupts (INTx), and only allows MSI for WLAN device. So adding a quirk to list those machines and set use_msi automatically. Adding the following platforms to the quirk. Dell Inspiron 24-3460 Dell Inspiron 3472 Dell Inspiron 14-3473 Dell Vostro 3262 Dell Vostro 15-3572 Signed-off-by: AceLan Kao --- drivers/net/wireless/ath/ath9k/init.c | 53 +++ 1 file changed, 53 insertions(+) diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 43adead..e479fae 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "ath9k.h" @@ -96,6 +97,56 @@ static const struct ieee80211_tpt_blink ath9k_tpt_blink[] = { }; #endif +static int __init set_use_msi(const struct dmi_system_id *dmi) +{ + ath9k_use_msi = 1; + return 1; +} + +static const struct dmi_system_id ath9k_quirks[] __initconst = { + { + .callback = set_use_msi, + .ident = "Dell Inspiron 24-3460", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 24-3460"), + }, + }, + { + .callback = set_use_msi, + .ident = "Dell Vostro 3262", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 3262"), + }, + }, + { + .callback = set_use_msi, + .ident = "Dell Inspiron 3472", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 3472"), + }, + }, + { + .callback = set_use_msi, + .ident = "Dell Vostro 15-3572", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 15-3572"), + }, + }, + { + .callback = set_use_msi, + .ident = "Dell Inspiron 14-3473", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 14-3473"), + }, + }, + {} +}; + static void ath9k_deinit_softc(struct ath_softc *sc); static void ath9k_op_ps_wakeup(struct ath_common *common) 
@@ -1104,6 +1155,8 @@ static int __init ath9k_init(void) goto err_pci_exit; } + dmi_check_system(ath9k_quirks); + return 0; err_pci_exit: -- 2.7.4
Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev
From: Jakub Kicinski Date: Mon, 8 Jan 2018 12:04:31 -0800 > Ugh, I so didn't review this in time :( I think there is a consensus > that we should avoid duplicating standard stats in ethtool. Especially > those old ones. Like "collisions", I assume this is a modern NIC, are > collisions still a thing? There is no standard way to get per-queue values, and ethtool stats are how pretty much every driver provides it.
Re: b43: Replace mdelay with msleep in b43_radio_2057_init_post
On 2018/1/9 0:31, Larry Finger wrote: On 01/08/2018 10:21 AM, Kalle Valo wrote: Jia-Ju Bai wrote: b43_radio_2057_init_post is not called in an interrupt handler nor holding a spinlock. The function mdelay in it can be replaced with msleep, to reduce busy wait. Signed-off-by: Jia-Ju Bai You submitted an identical patch a week earlier: https://patchwork.kernel.org/patch/10137671/ How is this different? Also always add version number to the patch so that the maintainers can follow the changes easily: https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches#patch_version_missing https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches#changelog_missing I had negative comments on one of those due to the possibility of msleep(2) extending as long as 20 msec. Until the author, or someone else, can test that this is OK, then the mdelay(2) can only be replaced with usleep_range(2000, 3000). NACK for both. Larry Sorry for my mistake. I have sent a patch v2 using usleep_range(2000, 3000), and you can have a look :) Thanks, Jia-Ju Bai
[PATCH v2] b43: Replace mdelay with usleep_range in b43_radio_2057_init_post
b43_radio_2057_init_post is not called in an interrupt handler nor holding a spinlock. The function mdelay in it can be replaced with usleep_range, to reduce busy wait. Signed-off-by: Jia-Ju Bai --- v2: * Replace mdelay with usleep_range, instead of msleep in v1. Thank Larry for good advice. --- drivers/net/wireless/broadcom/b43/phy_n.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43/phy_n.c b/drivers/net/wireless/broadcom/b43/phy_n.c index a5557d7..f2a2f41 100644 --- a/drivers/net/wireless/broadcom/b43/phy_n.c +++ b/drivers/net/wireless/broadcom/b43/phy_n.c @@ -1031,7 +1031,7 @@ static void b43_radio_2057_init_post(struct b43_wldev *dev) b43_radio_set(dev, R2057_RFPLL_MISC_CAL_RESETN, 0x78); b43_radio_set(dev, R2057_XTAL_CONFIG2, 0x80); - mdelay(2); + usleep_range(2000, 3000); b43_radio_mask(dev, R2057_RFPLL_MISC_CAL_RESETN, ~0x78); b43_radio_mask(dev, R2057_XTAL_CONFIG2, ~0x80); -- 1.7.9.5
Re: [PATCH net 0/3] Some sockopt optlen fixes
On Mon, Jan 08, 2018 at 07:02:26PM -0200, Marcelo Ricardo Leitner wrote: > Hangbin Liu reported that some SCTP sockopts are allowing the user to get > the kernel to allocate really large buffers by not having a ceiling on > optlen. > > This patchset addresses this issue (in patch 2), replaces a GFP_ATOMIC > that isn't needed and avoids calculating the option size multiple times > in some setsockopt. > > Marcelo Ricardo Leitner (3): > sctp: GFP_ATOMIC is not needed in sctp_setsockopt_events > sctp: add a ceiling to optlen in some sockopts > sctp: make use of pre-calculated len > > net/sctp/socket.c | 28 +--- > 1 file changed, 21 insertions(+), 7 deletions(-) > > -- > 2.14.3 > > Series Acked-by: Neil Horman
RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> -Original Message- > From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil > Sutter > Sent: Monday, January 8, 2018 9:32 PM > To: Chris Mi > Cc: dsah...@gmail.com; marcelo.leit...@gmail.com; > netdev@vger.kernel.org; gerlitz...@gmail.com; > step...@networkplumber.org > Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode > > Hi Chris, > > On Mon, Jan 08, 2018 at 02:03:53AM +, Chris Mi wrote: > > > On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote: > > > > The insertion rate is improved more than 10%. > > > > > > Did you measure the effect of increasing batch sizes? > > Yes. Even if we enlarge the batch size bigger than 10, there is no big > improvement. > > I think that's because current kernel doesn't process the requests in > parallel. > > If kernel processes the requests in parallel, I believe specifying a > > bigger batch size will get a better result. > > But throughput doesn't regress at some point, right? I think that's the > critical > aspect when considering an "unlimited" batch size. Yes. > > On Mon, Jan 08, 2018 at 08:00:00AM +, Chris Mi wrote: > > After testing, I find that the message passed to kernel should not be too > big. > > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE). > > That is about 400 commands. So how about set batch size to 128 which is > big enough? > > If that's the easiest way, why not. At first, I thought one could maybe send > the collected messages in chunks of suitable size, but that's probably not > worth the effort. OK. -Chris
Re: [RFC PATCH bpf-next v2 0/4] Separate error injection table from kprobes
On Thu, 4 Jan 2018 11:07:16 -0500 Josef Bacik wrote: > On Tue, Dec 26, 2017 at 04:46:28PM +0900, Masami Hiramatsu wrote: > > Hi Josef and Alexei, > > > > Here is the 2nd version of the patches to move the error injection > > table from kprobes. In this series I made some small fixes and > > added function-based fault injection. > > > > Here is the previous version: > > > > https://lkml.org/lkml/2017/12/22/554 > > > > There are 2 main reasons why I separate it from kprobes. > > > > - kprobes users can modify execution path not only at > >error-injection whitelist functions but also other > >functions. I don't like to suggest user that such > >limitation is from kprobes itself. > > > > - This error injection information is also useful for > >ftrace (function-hook) and livepatch. It should not > >be limited by CONFIG_KPROBES. > > > > So I introduced CONFIG_FUNCTION_ERROR_INJECTION for this feature. > > Also CONFIG_FAIL_FUNCTION is added, which provides function-based > > error injection interface via debugfs following fault-injection > > framework. See [4/4]. > > > > Any thoughts? > > Sorry Masami, I've been on vacation for the last two weeks. This approach is > fine by me, if we want to allow other mechanisms other than bpf to use this > functionality then hooray. I'll do a proper review when you post v3, just > wanted to let you know I wasn't ignoring you. Thanks, Yeah, thank you for the kind notice ;) BTW, could you tell me how I can run your test case? When I tried to build the tests (samples/bpf) I got the error below and stopped. 
[mhiramat@devbox bpf]$ LANG=C make make -C ../../ /home/mhiramat/ksrc/linux/samples/bpf/ make[1]: Entering directory '/home/mhiramat/ksrc/linux' CHK include/config/kernel.release CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h CHK include/generated/bounds.h CHK include/generated/timeconst.h CHK include/generated/asm-offsets.h CALLscripts/checksyscalls.sh DESCEND objtool CHK scripts/mod/devicetable-offsets.h HOSTCC /home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.o /home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.c:39:8: error: redefinition of 'struct list_head' struct list_head { ^ In file included from /home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.c:9:0: ./tools/include/linux/types.h:69:8: note: originally defined here struct list_head { ^ make[2]: *** [scripts/Makefile.host:107: /home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.o] Error 1 make[1]: *** [Makefile:1675: /home/mhiramat/ksrc/linux/samples/bpf/] Error 2 make[1]: Leaving directory '/home/mhiramat/ksrc/linux' make: *** [Makefile:204: all] Error 2 Thank you, -- Masami Hiramatsu
Re: linux-next: manual merge of the net-next tree with the bpf tree
On Tue, Jan 09, 2018 at 11:21:25AM +1100, Stephen Rothwell wrote: > Hi all, > > Today's linux-next merge of the net-next tree got a conflict in: > > tools/testing/selftests/bpf/test_align.c > > between commit: > > 2b36047e7889 ("selftests/bpf: fix test_align") > > from the bpf tree and commit: > > 6a28b446b7d2 ("selftests/bpf: adjust test_align expected output") > > from the net-next tree. > > I fixed it up (see below) and can carry the fix as necessary. This > is now fixed as far as linux-next is concerned, but any non trivial > conflicts should be mentioned to your upstream maintainer when your tree > is submitted for merging. You may also want to consider cooperating > with the maintainer of the conflicting tree to minimise any particularly > complex conflicts. > > -- > Cheers, > Stephen Rothwell > > diff --cc tools/testing/selftests/bpf/test_align.c > index 471bbbdb94db,fe916d29e166.. > --- a/tools/testing/selftests/bpf/test_align.c > +++ b/tools/testing/selftests/bpf/test_align.c > @@@ -473,8 -473,28 +473,8 @@@ static struct bpf_align_test tests[] = > .prog_type = BPF_PROG_TYPE_SCHED_CLS, > .result = REJECT, > .matches = { > - {4, "R5=pkt(id=0,off=0,r=0,imm=0)"}, > + {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, thanks. That's correct resolution.
linux-next: manual merge of the net-next tree with the bpf tree
Hi all, Today's linux-next merge of the net-next tree got a conflict in: tools/testing/selftests/bpf/test_align.c between commit: 2b36047e7889 ("selftests/bpf: fix test_align") from the bpf tree and commit: 6a28b446b7d2 ("selftests/bpf: adjust test_align expected output") from the net-next tree. I fixed it up (see below) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. -- Cheers, Stephen Rothwell diff --cc tools/testing/selftests/bpf/test_align.c index 471bbbdb94db,fe916d29e166.. --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@@ -473,8 -473,28 +473,8 @@@ static struct bpf_align_test tests[] = .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {4, "R5=pkt(id=0,off=0,r=0,imm=0)"}, + {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, - /* ptr & 0x40 == either 0 or 0x40 */ - {5, "R5_w=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"}, - /* ptr << 2 == unknown, (4n) */ - {7, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffc))"}, - /* (4n) + 14 == (4n+2). We blow our bounds, because - * the add could overflow. - */ - {8, "R5=inv(id=0,var_off=(0x2; 0xfffc))"}, - /* Checked s>=0 */ - {10, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, - /* packet pointer + nonnegative (4n+2) */ - {12, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, - {14, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, - /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. 
- * We checked the bounds, but it might have been able - * to overflow if the packet pointer started in the - * upper half of the address space. - * So we did not get a 'range' on R6, and the access - * attempt will fail. - */ - {16, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* R5 bitwise operator &= on pointer prohibited */ } }, {
Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok
On Mon, Jan 8, 2018 at 3:53 PM, Dan Williams wrote: > > I've been thinking the "and" is only suitable for the array bounds > check, for get_user() we're trying to block speculation past > access_ok() at which point we can only do the lfence? Well, we *could* do the "and", at least for the simple cases (ie the true "get_user()" that integrates the access_ok with the access). IOW, mainly the code in arch/x86/lib/getuser.S. But it probably is a lot simpler to just add the "lfence" to ASM_STAC, because by definition those cases don't tend to be the truly critical ones - people who use those functions tend to do one or two accesses, and the real cost is likely the I$ misses and the D$ miss to get current->addr_limit. Not to mention the "stac" itself, which is much more expensive than the access on current microarchitectures. But something like this *might* work: index c97d935a29e8..7fa3d293beaf 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -38,8 +38,11 @@ .text ENTRY(__get_user_1) mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + mov TASK_addr_limit(%_ASM_DX),%_ASM_DX + cmp %_ASM_DX,%_ASM_AX jae bad_get_user + or $0xfff,%_ASM_DX + and %_ASM_DX,%_ASM_AX ASM_STAC 1: movzbl (%_ASM_AX),%edx xor %eax,%eax (this only does the one-byte case - the 2/4/8 byte cases are exactly the same). The above is completely untested and might have some stupid thinko/typo, so take it purely as a "example patch" to show the concept, rather than actually do it. But just adding "lfence" to the existing ASM_STAC is a hell of a lot easier, and the performance difference between that trivial patch and the above "let's be clever with 'and'" might not be measurable. I really have no idea how expensive lfence might actually end up being in practice. It's possible that lfence is actually fairly cheap in kernel code, since we tend to not have very high IPC anyway. Linus
Re: [PATCH v2] openvswitch: Trim off padding before L3+ netfilter processing
On Sat, Jan 6, 2018 at 10:57 AM, Pravin Shelar wrote: > On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk wrote: >> >> >> On Jan 5, 2018 22:17, "Pravin Shelar" wrote: >> >> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk >> wrote: >>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk >>> wrote: On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar wrote: > OVS already pull all required headers in skb linear data, so no need > to redo all of it. only check required is the ip-checksum validation. > I think we could avoid it in most of cases by checking skb length to > ipheader length before verifying the ip header-checksum. Shouldn't the IP header checksum be verified even earlier, like in key_extract(), before actually using any of the fields in the IP header? >>> >>> Something like this for verifying the IP header checksum (not tested): >>> >> AFAIU openflow does not need this verification, so it is not required >> in flow extract. >> >> >> Okay. How about my proposed trimming implementation, caching the pad length >> in the ovs cb? >> > Caching the length is not that simple, OVS actions can change the > length. Keeping it consistent with packet would be more work, so lets > calculate it in ovs-ct function. You could make it specific to skb-len-trimming, something like a boolean flag, so that it is easy to reason about.
Re: [PATCH net-next 2/2] openvswitch: add erspan version II support
On Fri, Jan 5, 2018 at 2:29 PM, William Tu wrote: > The patch adds support for configuring the erspan version II > fields for openvswitch. > The patch looks good, But it could change userspace API for OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, how are we going to handle compatibility? > Signed-off-by: William Tu > --- > include/uapi/linux/openvswitch.h | 12 +++- > net/openvswitch/flow_netlink.c | 125 > +++ > 2 files changed, 126 insertions(+), 11 deletions(-) > > diff --git a/include/uapi/linux/openvswitch.h > b/include/uapi/linux/openvswitch.h > index 4265d7f9e1f2..3b1950c59a0c 100644 > --- a/include/uapi/linux/openvswitch.h > +++ b/include/uapi/linux/openvswitch.h > @@ -273,6 +273,16 @@ enum { > > #define OVS_VXLAN_EXT_MAX (__OVS_VXLAN_EXT_MAX - 1) > > +enum { > + OVS_ERSPAN_OPT_UNSPEC, > + OVS_ERSPAN_OPT_IDX, /* be32 index */ > + OVS_ERSPAN_OPT_VER, /* u8 version number */ > + OVS_ERSPAN_OPT_DIR, /* u8 direction */ > + OVS_ERSPAN_OPT_HWID,/* u8 hardware ID */ > + __OVS_ERSPAN_OPT_MAX, > +}; > + > +#define OVS_ERSPAN_OPT_MAX (__OVS_ERSPAN_OPT_MAX - 1) > > /* OVS_VPORT_ATTR_OPTIONS attributes for tunnels. > */ > @@ -363,7 +373,7 @@ enum ovs_tunnel_key_attr { > OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 > address. */ > OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 > address. */ > OVS_TUNNEL_KEY_ATTR_PAD, > - OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,/* be32 ERSPAN index. */ > + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,/* Nested OVS_ERSPAN_OPT_* */ > __OVS_TUNNEL_KEY_ATTR_MAX > }; > > diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c > index bce1f78b0de5..696198cf3765 100644 > --- a/net/openvswitch/flow_netlink.c > +++ b/net/openvswitch/flow_netlink.c > @@ -334,8 +334,10 @@ size_t ovs_tun_key_attr_size(void) > * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. 
> */ > + nla_total_size(2)/* OVS_TUNNEL_KEY_ATTR_TP_SRC */ > - + nla_total_size(2)/* OVS_TUNNEL_KEY_ATTR_TP_DST */ > - + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ > + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ > + /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with > +* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. > +*/ > } > > static size_t ovs_nsh_key_attr_size(void) > @@ -386,6 +388,13 @@ static const struct ovs_len_tbl > ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = > [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, > }; > > +static const struct ovs_len_tbl ovs_erspan_opt_lens[OVS_ERSPAN_OPT_MAX + 1] > = { > + [OVS_ERSPAN_OPT_IDX]= { .len = sizeof(u32) }, > + [OVS_ERSPAN_OPT_VER]= { .len = sizeof(u8) }, > + [OVS_ERSPAN_OPT_DIR]= { .len = sizeof(u8) }, > + [OVS_ERSPAN_OPT_HWID] = { .len = sizeof(u8) }, > +}; > + > static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX > + 1] = { > [OVS_TUNNEL_KEY_ATTR_ID]= { .len = sizeof(u64) }, > [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, > @@ -402,7 +411,8 @@ static const struct ovs_len_tbl > ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] > .next = > ovs_vxlan_ext_key_lens }, > [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct > in6_addr) }, > [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct > in6_addr) }, > - [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, > + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_NESTED, > + .next = ovs_erspan_opt_lens }, > }; > > static const struct ovs_len_tbl > @@ -640,16 +650,78 @@ static int erspan_tun_opt_from_nlattr(const struct > nlattr *attr, > { > unsigned long opt_key_offset; > struct erspan_metadata opts; > + struct nlattr *a; > + u16 hwid, dir; > + int rem; > > BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); > > memset(&opts, 0, sizeof(opts)); > - opts.u.index = nla_get_be32(attr); > + nla_for_each_nested(a, attr, rem) { > + int type = nla_type(a); > > - /* Index has 
only 20-bit */ > - if (ntohl(opts.u.index) & ~INDEX_MASK) { > - OVS_NLERR(log, "ERSPAN index number %x too large.", > - ntohl(opts.u.index)); > + if (type > OVS_ERSPAN_OPT_MAX) { > + OVS_NLERR(log, "ERSPAN option %d out of range max %d", > + type, OVS_ERSPAN_OPT_MAX); > + return -EINVAL; > + } > + > + if (!check_attr_len(nla_len(a), > +
Re: [PATCH bpf-next] bpf: introduce BPF_JIT_ALWAYS_ON config
On Mon, 8 Jan 2018 22:59:04 +0100, Daniel Borkmann wrote: > > @@ -1453,6 +1457,11 @@ struct bpf_prog *bpf_prog_select_runtime(struct > > bpf_prog *fp, int *err) > > */ > > *err = bpf_check_tail_call(fp); > > > > +#ifdef CONFIG_BPF_JIT_ALWAYS_ON > > + if (!fp->jited) > > + *err = -ENOTSUPP; > > +#endif I think programs JITed for offload won't have fp->jited set, but those are pretty safe from CPU bugs. Should we set fp->jited = 1; in bpf_prog_offload_compile()? Just throwing "&& !bpf_prog_is_dev_bound()" in here seems cleaner to me. FWIW if you have netdevsim compiled and recent iproute2, this will work to check: # ip link add type netdevsim # ip link set netdevsim0 xdpoffload obj ~/xdp/pass.o
Re: [PATCH v3 bpf] bpf: prevent out-of-bounds speculation
On 01/08/2018 02:33 AM, Alexei Starovoitov wrote: > Under speculation, CPUs may mis-predict branches in bounds checks. Thus, > memory accesses under a bounds check may be speculated even if the > bounds check fails, providing a primitive for building a side channel. > > To avoid leaking kernel data round up array-based maps and mask the index > after bounds check, so speculated load with out of bounds index will load > either valid value from the array or zero from the padded area. > > Unconditionally mask index for all array types even when max_entries > are not rounded to power of 2 for root user. > When map is created by unpriv user generate a sequence of bpf insns > that includes AND operation to make sure that JITed code includes > the same 'index & index_mask' operation. > > If prog_array map is created by unpriv user replace > bpf_tail_call(ctx, map, index); > with > if (index >= max_entries) { > index &= map->index_mask; > bpf_tail_call(ctx, map, index); > } > (along with roundup to power 2) to prevent out-of-bounds speculation. > There is secondary redundant 'if (index >= max_entries)' in the interpreter > and in all JITs, but they can be optimized later if necessary. > > Other array-like maps (cpumap, devmap, sockmap, perf_event_array, > cgroup_array) > cannot be used by unpriv, so no changes there. > > That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on > all architectures with and without JIT. > > v2->v3: > Daniel noticed that attack potentially can be crafted via syscall commands > without loading the program, so add masking to those paths as well. > > Signed-off-by: Alexei Starovoitov > Acked-by: John Fastabend Applied to bpf tree, thanks Alexei!
Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok
On Mon, Jan 8, 2018 at 3:44 PM, Linus Torvalds wrote: > On Mon, Jan 8, 2018 at 1:09 PM, Dan Williams wrote: >> On Sat, Jan 6, 2018 at 5:20 PM, Linus Torvalds >> wrote: >>> On Sat, Jan 6, 2018 at 3:31 PM, Dan Williams >>> wrote: I assume if we put this in uaccess_begin() we also need audit for paths that use access_ok but don't do on to call uaccess_begin()? A quick glance shows a few places where we are open coding the stac(). Perhaps land the lfence in stac() directly? >>> >>> Yeah, we should put it in uaccess_begin(), and in the actual user >>> accessor helpers that do stac. Some of them probably should be changed >>> to use uaccess_begin() instead while at it. >>> >>> One question for the CPU people: do we actually care and need to do >>> this for things that might *write* to something? The speculative write >>> obviously is killed, but does it perhaps bring in a cacheline even >>> when killed? >> >> As far as I understand a write could trigger a request-for-ownership >> read for the target cacheline. > > Oh, absolutely. > > I just wonder at what point that happens. > > Honestly, trying to get exclusive access to a cacheline can be _very_ > expensive (not just for the local thread), so I would actually expect > that doing so for speculative writes is actually bad for performance. > > That's doubly true because - unlike reads - there is no critical > latency issue, so trying to get the cache access started as early as > possible simply isn't all that important. > > So I suspect that a write won't actually try to allocate the cacheline > until the write has actually retired. > > End result: writes - unlike reads - *probably* will not speculatively > perturb the cache with speculative write addresses. > >> Even though writes can trigger reads, as far as I can see the write >> needs to be dependent on the first out-of-bounds read > > Yeah. 
A write on its own wouldn't matter, even if it were to perturb > the cache state, because the address already comes from user space, so > there's no new information in the cache perturbation for the attacker. > > But that all implies that we shouldn't need the lfence for the > "put_user()" case, only for the get_user() (where the value we read > would then perhaps be used to do another access). > > So we want to add the lfence (or "and") to get_user(), but not > necessarily put_user(). Yes, perhaps __uaccess_begin_get() and __uaccess_begin_put() to keep things separate? > Agreed? I've been thinking the "and" is only suitable for the array bounds check, for get_user() we're trying to block speculation past access_ok() at which point we can only do the lfence?
Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok
On Mon, Jan 8, 2018 at 1:09 PM, Dan Williams wrote: > On Sat, Jan 6, 2018 at 5:20 PM, Linus Torvalds > wrote: >> On Sat, Jan 6, 2018 at 3:31 PM, Dan Williams >> wrote: >>> >>> I assume if we put this in uaccess_begin() we also need audit for >>> paths that use access_ok but don't do on to call uaccess_begin()? A >>> quick glance shows a few places where we are open coding the stac(). >>> Perhaps land the lfence in stac() directly? >> >> Yeah, we should put it in uaccess_begin(), and in the actual user >> accessor helpers that do stac. Some of them probably should be changed >> to use uaccess_begin() instead while at it. >> >> One question for the CPU people: do we actually care and need to do >> this for things that might *write* to something? The speculative write >> obviously is killed, but does it perhaps bring in a cacheline even >> when killed? > > As far as I understand a write could trigger a request-for-ownership > read for the target cacheline. Oh, absolutely. I just wonder at what point that happens. Honestly, trying to get exclusive access to a cacheline can be _very_ expensive (not just for the local thread), so I would actually expect that doing so for speculative writes is actually bad for performance. That's doubly true because - unlike reads - there is no critical latency issue, so trying to get the cache access started as early as possible simply isn't all that important. So I suspect that a write won't actually try to allocate the cacheline until the write has actually retired. End result: writes - unlike reads - *probably* will not speculatively perturb the cache with speculative write addresses. > Even though writes can trigger reads, as far as I can see the write > needs to be dependent on the first out-of-bounds read Yeah. A write on its own wouldn't matter, even if it were to perturb the cache state, because the address already comes from user space, so there's no new information in the cache perturbation for the attacker. 
But that all implies that we shouldn't need the lfence for the "put_user()" case, only for the get_user() (where the value we read would then perhaps be used to do another access). So we want to add the lfence (or "and") to get_user(), but not necessarily put_user(). Agreed? Linus
Re: [PATCH bpf] selftests/bpf: fix test_align
On 1/8/18 8:38 AM, Edward Cree wrote: On 05/01/18 23:02, Alexei Starovoitov wrote: since commit 82abbf8d2fc4 the verifier rejects the bit-wise arithmetic on pointers earlier. The test 'dubious pointer arithmetic' now has less output to match on. Adjust it. Fixes: 82abbf8d2fc4 ("bpf: do not allow root to mangle valid pointers") Reported-by: kernel test robot Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_align.c | 22 +- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 8591c89c0828..471bbbdb94db 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -474,27 +474,7 @@ static struct bpf_align_test tests[] = { .result = REJECT, .matches = { {4, "R5=pkt(id=0,off=0,r=0,imm=0)"}, - /* ptr & 0x40 == either 0 or 0x40 */ - {5, "R5=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"}, - /* ptr << 2 == unknown, (4n) */ - {7, "R5=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffc))"}, - /* (4n) + 14 == (4n+2). We blow our bounds, because -* the add could overflow. -*/ - {8, "R5=inv(id=0,var_off=(0x2; 0xfffc))"}, - /* Checked s>=0 */ - {10, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, - /* packet pointer + nonnegative (4n+2) */ - {12, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, - {14, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, - /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. -* We checked the bounds, but it might have been able -* to overflow if the packet pointer started in the -* upper half of the address space. -* So we did not get a 'range' on R6, and the access -* attempt will fail. 
-*/ - {16, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffc))"}, + /* R5 bitwise operator &= on pointer prohibited */ } }, { Rather than neutering this test, we should change it to keep the part where it tests that a large pkt_ptr offset prevents us getting a reg->range. Specifically, in this test we have r2 = pkt r5 = large unknown scalar r6 = r2 + r5 r4 = r6 + 4 Then we check r4 < pkt_end, which normally would give r6->range = 4, but in this case must not do so since r6 could be (u64)(-2) in which case r4 = 2 < pkt_end despite r6 not pointing into the packet. AFAICT there is not other coverage of this case in test_align, and I don't recall such a test being in test_verifier either. So please instead replace the insns that do prohibited ops on pointers with some other way of creating a large unknown scalar, and keep the rest of the test case intact. makes sense. will send a follow up patch when security dust settles.
Re: [PATCH bpf] bpf: prevent out-of-bounds speculation
On 1/8/18 9:05 AM, Mark Rutland wrote: Hi Alexei, On Thu, Jan 04, 2018 at 08:28:11PM -0800, Alexei Starovoitov wrote: From: Alexei Starovoitov Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Thanks for putting this together, this certainly looks neat. I'm a little worried that in the presence of some CPU/compiler optimisations, the masking may effectively be skipped under speculation. So I'm not sure how robust this is going to be. More on that below. To avoid duplicating map_lookup functions for root/unpriv always generate a sequence of bpf instructions equivalent to map_lookup function for array and array_of_maps map types when map was created by unpriv user. And unconditionally mask index for percpu_array, since it's fast enough, even when max_entries are not rounded to power of 2 for root user, since percpu_array doesn't have map_gen_lookup callback yet. Is there a noticeable slowdown from the masking? Can't we always have that in place? right. Please see v3 version: https://patchwork.ozlabs.org/patch/856645/ Daniel noticed that speculation can happen without program being loaded and we need to tighten the path via syscall as well. so v3 is doing masking for all array types unconditionally. The perf cost is within noise for interpreter and not seen with JITed root code, since gen_lookup does not emit AND for root. 
@@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); As above, I think this isn't necessarily robust, as CPU/compiler optimisations can break the dependency on the index_mask, allowing speculation without a mask. e.g. a compiler could re-write this as: if (array->index_mask != 0xffffffff) index &= array->index_mask; return this_cpu_ptr(array->pptrs[index]); ... which would allow an unmasked index to be used in speculated paths. prior to kernel I've been working on sun, gcc, llvm compilers and I've never seen such optimization ever proposed for AND. It makes no sense. For heavy ALU like div/mod and calls compiler does indeed try to predict the value. de-virtualization is an example optimization for indirect calls. Intel compiler pioneered this approach back in 2000. Compilers can also optimize "div by X" into if (x == const) unroll div by const into something faster; else div by X Such optimizations are rarely done without profile feedback, since branch is costly the compiler will add a branch only if there is a clear win from introducing it instead of doing the operation. For and, or, shift, add, sub there is never a case to do so. Instead compiler is always trying to remove branches instead of introducing them. Similar cases could occur with some CPU implementations. For example, HW value-prediction could result in the use of an all-ones mask under speculation. please see the paper that Alan mentioned. HW value speculation predicts likely valid values. It makes no sense for HW to continue speculative execution with random value. 
Consider array[index & index_mask] if load index_mask stalls and cpu decides to continue speculation with random value (both zero and are considered random) it will proceed through AND and second load will populate the precious cache with completely irrelevant data. Such cpu will be slower with speculative execution than without, since it populates the caches with random data. I think that we may need to be able to provide an arch-specific pointer sanitization sequence (though we could certainly have masking as the default). I still don't understand where this paranoia is coming from. Kernel doesn't need to kill speculation. It needs to manage it. I have a rough idea as to how that could be plumbed into the JIT. First I need to verify the sequence I have in mind for arm/arm64 is sufficient. hmm? the patch provided (both v2 and v3) doesn't need any JIT changes on either x64, arm, etc. gen_lookup() emits BPF_AND on index that JIT converts into actual AND in native instruction set.
[PATCH next-queue 2/2] ixgbe: add unlikely notes to tx fastpath expressions
Add unlikely() to a few error checking expressions in the Tx offload handling. Suggested-by: Yanjun Zhu Signed-off-by: Shannon Nelson --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 57c10e6..3d069a2 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -749,28 +749,28 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring, struct xfrm_state *xs; struct tx_sa *tsa; - if (!first->skb->sp->len) { + if (unlikely(!first->skb->sp->len)) { netdev_err(tx_ring->netdev, "%s: no xfrm state len = %d\n", __func__, first->skb->sp->len); return 0; } xs = xfrm_input_state(first->skb); - if (!xs) { + if (unlikely(!xs)) { netdev_err(tx_ring->netdev, "%s: no xfrm_input_state() xs = %p\n", __func__, xs); return 0; } itd->sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX; - if (itd->sa_idx > IXGBE_IPSEC_MAX_SA_COUNT) { + if (unlikely(itd->sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) { netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n", __func__, itd->sa_idx, xs->xso.offload_handle); return 0; } tsa = &ipsec->tx_tbl[itd->sa_idx]; - if (!tsa->used) { + if (unlikely(!tsa->used)) { netdev_err(tx_ring->netdev, "%s: unused sa_idx=%d\n", __func__, itd->sa_idx); return 0; -- 2.7.4
[PATCH next-queue 1/2] ixgbe: fix clean hw loop count
Fix a cut-paste error so that we can clean all the table entries. Signed-off-by: Shannon Nelson --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 12c7132..57c10e6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -148,7 +148,7 @@ static void ixgbe_ipsec_clear_hw_tables(struct ixgbe_adapter *adapter) ixgbe_ipsec_set_rx_sa(hw, idx, 0, buf, 0, 0, 0); ixgbe_ipsec_set_rx_ip(hw, idx, (__be32 *)buf); } - for (; idx < IXGBE_IPSEC_MAX_RX_IP_COUNT; idx++) { + for (; idx < IXGBE_IPSEC_MAX_SA_COUNT; idx++) { ixgbe_ipsec_set_tx_sa(hw, idx, buf, 0); ixgbe_ipsec_set_rx_sa(hw, idx, 0, buf, 0, 0, 0); } -- 2.7.4
RE: [Intel-wired-lan] [PATCH 01/27] timecounter: Make cyclecounter struct part of timecounter struct
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On > Behalf Of Sagar Arun Kamble > Sent: Thursday, December 14, 2017 11:38 PM > To: linux-ker...@vger.kernel.org > Cc: alsa-de...@alsa-project.org; linux-r...@vger.kernel.org; > netdev@vger.kernel.org; Richard Cochran ; > Stephen Boyd ; Chris Wilson wilson.co.uk>; John Stultz ; intel-wired- > l...@lists.osuosl.org; Thomas Gleixner ; Kamble, Sagar A > ; kvm...@lists.cs.columbia.edu; linux-arm- > ker...@lists.infradead.org > Subject: [Intel-wired-lan] [PATCH 01/27] timecounter: Make cyclecounter > struct part of timecounter struct > > There is no real need for the users of timecounters to define cyclecounter > and timecounter variables separately. Since timecounter will always be > based on cyclecounter, have cyclecounter struct as member of timecounter > struct. > > v2: Rebase. > > Suggested-by: Chris Wilson > Signed-off-by: Sagar Arun Kamble > Cc: Chris Wilson > Cc: Richard Cochran > Cc: John Stultz > Cc: Thomas Gleixner > Cc: Stephen Boyd > Cc: linux-ker...@vger.kernel.org > Cc: linux-arm-ker...@lists.infradead.org > Cc: netdev@vger.kernel.org > Cc: intel-wired-...@lists.osuosl.org > Cc: linux-r...@vger.kernel.org > Cc: alsa-de...@alsa-project.org > Cc: kvm...@lists.cs.columbia.edu > Acked-by: Jeff Kirsher (Intel drivers) > --- > arch/microblaze/kernel/timer.c | 20 ++-- > drivers/clocksource/arm_arch_timer.c | 19 ++-- > drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 3 +- > drivers/net/ethernet/amd/xgbe/xgbe-ptp.c | 9 +++--- > drivers/net/ethernet/amd/xgbe/xgbe.h | 1 - > drivers/net/ethernet/broadcom/bnx2x/bnx2x.h| 1 - > drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 20 ++-- > drivers/net/ethernet/freescale/fec.h | 1 - > drivers/net/ethernet/freescale/fec_ptp.c | 30 +- > drivers/net/ethernet/intel/e1000e/e1000.h | 1 - > drivers/net/ethernet/intel/e1000e/netdev.c | 27 > drivers/net/ethernet/intel/e1000e/ptp.c| 2 +- > drivers/net/ethernet/intel/igb/igb.h | 1 - > 
drivers/net/ethernet/intel/igb/igb_ptp.c | 25 --- > drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 - > drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 17 +- > drivers/net/ethernet/mellanox/mlx4/en_clock.c | 28 - > drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 - > .../net/ethernet/mellanox/mlx5/core/lib/clock.c| 34 ++-- > drivers/net/ethernet/qlogic/qede/qede_ptp.c| 20 ++-- > drivers/net/ethernet/ti/cpts.c | 36 > -- > drivers/net/ethernet/ti/cpts.h | 1 - > include/linux/mlx5/driver.h| 1 - > include/linux/timecounter.h| 4 +-- > include/sound/hdaudio.h| 1 - > kernel/time/timecounter.c | 28 - > sound/hda/hdac_stream.c| 7 +++-- > virt/kvm/arm/arch_timer.c | 6 ++-- > 28 files changed, 163 insertions(+), 182 deletions(-) > For Intel e1000e and igb drivers: Tested-by: Aaron Brown
Re: dvb usb issues since kernel 4.9
On Mon, 8 Jan 2018 22:44:27 +0100 Peter Zijlstra wrote: > On Mon, Jan 08, 2018 at 10:31:09PM +0100, Jesper Dangaard Brouer wrote: > > I did expect the issue to get worse when you load the Pi with > > network traffic, as now the softirq time-budget has to be shared > > between networking and USB/DVB. Thus, I guess you are running TCP and > > USB/mpeg2ts on the same CPU (why when you have 4 CPUs?...) > > Isn't networking also over USB on the Pi ? Darn, that is true. Looking at the dmesg output in http://ix.io/DOg: [0.405942] usbcore: registered new interface driver smsc95xx [5.821104] smsc95xx 1-1.1:1.0 eth0: link up, 100Mbps, full-duplex, lpa 0x45E1 I don't know enough about USB... is it possible to control which CPU handles the individual USB ports, or on some other level (than ports)? -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCH bpf-next] bpf: introduce BPF_JIT_ALWAYS_ON config
On 01/08/2018 04:35 AM, Alexei Starovoitov wrote: > The BPF interpreter has been used as part of the spectre 2 attack > CVE-2017-5715. > > A quote from goolge project zero blog: > "At this point, it would normally be necessary to locate gadgets in > the host kernel code that can be used to actually leak data by reading > from an attacker-controlled location, shifting and masking the result > appropriately and then using the result of that as offset to an > attacker-controlled address for a load. But piecing gadgets together > and figuring out which ones work in a speculation context seems annoying. > So instead, we decided to use the eBPF interpreter, which is built into > the host kernel - while there is no legitimate way to invoke it from inside > a VM, the presence of the code in the host kernel's text section is sufficient > to make it usable for the attack, just like with ordinary ROP gadgets." > > To make attacker job harder introduce BPF_JIT_ALWAYS_ON config > option that removes interpreter from the kernel in favor of JIT-only mode. > So far eBPF JIT is supported by: > x64, arm64, arm32, sparc64, s390, powerpc64, mips64 > > The start of JITed program is randomized and code page is marked as read-only. > In addition "constant blinding" can be turned on with net.core.bpf_jit_harden > > Signed-off-by: Alexei Starovoitov > --- > init/Kconfig | 7 +++ > kernel/bpf/core.c | 9 + > kernel/bpf/verifier.c | 4 > net/core/sysctl_net_core.c | 9 + > 4 files changed, 29 insertions(+) > > diff --git a/init/Kconfig b/init/Kconfig > index 2934249fba46..5e2a4a391ba9 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -1392,6 +1392,13 @@ config BPF_SYSCALL > Enable the bpf() system call that allows to manipulate eBPF > programs and maps via file descriptors. 
> > +config BPF_JIT_ALWAYS_ON > + bool "Permanently enable BPF JIT and remove BPF interpreter" > + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT > + help > + Enables BPF JIT and removes BPF interpreter to avoid > + speculative execution of BPF instructions by the interpreter > + > config USERFAULTFD > bool "Enable userfaultfd() system call" > select ANON_INODES > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 70a534549cd3..42756c434e0b 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -781,6 +781,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 > r4, u64 r5) > } > EXPORT_SYMBOL_GPL(__bpf_call_base); > > +#ifndef CONFIG_BPF_JIT_ALWAYS_ON > /** > * __bpf_prog_run - run eBPF program on a given context > * @ctx: is the data we are operating on > @@ -1376,6 +1377,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 > stack_depth) > __bpf_call_base_args; > insn->code = BPF_JMP | BPF_CALL_ARGS; > } > +#endif > > bool bpf_prog_array_compatible(struct bpf_array *array, > const struct bpf_prog *fp) > @@ -1427,9 +1429,11 @@ static int bpf_check_tail_call(const struct bpf_prog > *fp) > */ > struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) > { > +#ifndef CONFIG_BPF_JIT_ALWAYS_ON > u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); > > fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; > +#endif > > /* eBPF JITs can rewrite the program in case constant >* blinding is active. However, in case of error during > @@ -1453,6 +1457,11 @@ struct bpf_prog *bpf_prog_select_runtime(struct > bpf_prog *fp, int *err) >*/ > *err = bpf_check_tail_call(fp); > > +#ifdef CONFIG_BPF_JIT_ALWAYS_ON > + if (!fp->jited) > + *err = -ENOTSUPP; > +#endif This part here and ... > return fp; > } > EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); [...] 
> @@ -524,6 +530,9 @@ static __net_initdata struct pernet_operations > sysctl_core_ops = { > > static __init int sysctl_core_init(void) > { > +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_JIT_ALWAYS_ON) > + bpf_jit_enable = 1; > +#endif ... this one will race and break stuff in the current shape, one example is the PTP classifier in the tree: sysctl_core_init() is done in fs_initcall(), whereas ptp_classifier_init() is done in sock_init() which is done out of core_initcall(). So what will happen is that at this point in time bpf_jit_enable is not yet set to 1, so when ptp_classifier_init() calls the cBPF bpf_prog_create(), it will migrate the insns over to eBPF and in bpf_prog_select_runtime() called from bpf_migrate_filter() have the assumption that we always succeed here since when JIT fails, we will fall back to the interpreter anyway. The only error up until now in bpf_prog_select_runtime() that could happen is out of native eBPF prog load, so bpf_migrate_filter() will thus return just fine and on first call to PTP classifier from a network packet, we'll get NULL pointer deref since the fp->b
Re: dvb usb issues since kernel 4.9
On Mon, Jan 08, 2018 at 10:31:09PM +0100, Jesper Dangaard Brouer wrote: > I did expect the issue to get worse when you load the Pi with > network traffic, as now the softirq time-budget has to be shared > between networking and USB/DVB. Thus, I guess you are running TCP and > USB/mpeg2ts on the same CPU (why when you have 4 CPUs?...) Isn't networking also over USB on the Pi?
Re: dvb usb issues since kernel 4.9
On Mon, 8 Jan 2018 17:26:10 +0100 "Josef Griebichler" wrote: > I tried your mentioned patch but unfortunately no real improvement for me. > dmesg http://ix.io/DOg > tvheadend service log http://ix.io/DOi > > Errors during recording are still there. Are you _also_ recording the stream on the Raspberry Pi? It seems to me that you are expecting too much from this small device. > Errors increase if there is additional tcp load on raspberry. I did expect the issue to get worse when you load the Pi with network traffic, as now the softirq time-budget has to be shared between networking and USB/DVB. Thus, I guess you are running TCP and USB/mpeg2ts on the same CPU (why when you have 4 CPUs?...) If you expect/want to get stable performance out of such a small box, then you (or LibreELEC) need to tune the box for this usage. And it does not have to be that complicated. First step is to move IRQ handling for the NIC to another CPU than the USB port handling the DVB signal (/proc/irq/*/smp_affinity_list). And then pin the userspace process (taskset) to another CPU than the one handling USB-softirq. > Unfortunately there's no usbmon or tshark on libreelec so I can't > provide further logs. Do you have perf or trace-cmd on the box? Maybe we could come up with some kernel functions to trace, to measure/show the latency spikes? -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
[PATCH] wireless: broadcom: radio_2056: delete duplicated macro definitions
Ctrl-V was hit twice when these macros were inserted: $ sed -n '9,527p' ./drivers/net/wireless/broadcom/b43/radio_2056.h | md5sum 4db53450c59d9939e903d4e4ba6bc9b1 - $ sed -n '528,1046p' ./drivers/net/wireless/broadcom/b43/radio_2056.h | md5sum 4db53450c59d9939e903d4e4ba6bc9b1 - Signed-off-by: Rasmus Villemoes --- drivers/net/wireless/broadcom/b43/radio_2056.h | 519 - 1 file changed, 519 deletions(-) diff --git a/drivers/net/wireless/broadcom/b43/radio_2056.h b/drivers/net/wireless/broadcom/b43/radio_2056.h index 59297fdce5e3..779b80ea072f 100644 --- a/drivers/net/wireless/broadcom/b43/radio_2056.h +++ b/drivers/net/wireless/broadcom/b43/radio_2056.h @@ -525,525 +525,6 @@ #define B2056_VCM_MASK 0x1C #define B2056_RSSI_VCM_SHIFT 0x02 -#define B2056_SYN (0x0 << 12) -#define B2056_TX0 (0x2 << 12) -#define B2056_TX1 (0x3 << 12) -#define B2056_RX0 (0x6 << 12) -#define B2056_RX1 (0x7 << 12) -#define B2056_ALLTX(0xE << 12) -#define B2056_ALLRX(0xF << 12) - -#define B2056_SYN_RESERVED_ADDR0 0x00 -#define B2056_SYN_IDCODE 0x01 -#define B2056_SYN_RESERVED_ADDR2 0x02 -#define B2056_SYN_RESERVED_ADDR3 0x03 -#define B2056_SYN_RESERVED_ADDR4 0x04 -#define B2056_SYN_RESERVED_ADDR5 0x05 -#define B2056_SYN_RESERVED_ADDR6 0x06 -#define B2056_SYN_RESERVED_ADDR7 0x07 -#define B2056_SYN_COM_CTRL 0x08 -#define B2056_SYN_COM_PU 0x09 -#define B2056_SYN_COM_OVR 0x0A -#define B2056_SYN_COM_RESET0x0B -#define B2056_SYN_COM_RCAL 0x0C -#define B2056_SYN_COM_RC_RXLPF 0x0D -#define B2056_SYN_COM_RC_TXLPF 0x0E -#define B2056_SYN_COM_RC_RXHPF 0x0F -#define B2056_SYN_RESERVED_ADDR16 0x10 -#define B2056_SYN_RESERVED_ADDR17 0x11 -#define B2056_SYN_RESERVED_ADDR18 0x12 -#define B2056_SYN_RESERVED_ADDR19 0x13 -#define B2056_SYN_RESERVED_ADDR20 0x14 -#define B2056_SYN_RESERVED_ADDR21 0x15 -#define B2056_SYN_RESERVED_ADDR22 0x16 -#define B2056_SYN_RESERVED_ADDR23 0x17 -#define B2056_SYN_RESERVED_ADDR24 0x18 -#define B2056_SYN_RESERVED_ADDR25 0x19 -#define B2056_SYN_RESERVED_ADDR26 0x1A -#define 
B2056_SYN_RESERVED_ADDR27 0x1B -#define B2056_SYN_RESERVED_ADDR28 0x1C -#define B2056_SYN_RESERVED_ADDR29 0x1D -#define B2056_SYN_RESERVED_ADDR30 0x1E -#define B2056_SYN_RESERVED_ADDR31 0x1F -#define B2056_SYN_GPIO_MASTER1 0x20 -#define B2056_SYN_GPIO_MASTER2 0x21 -#define B2056_SYN_TOPBIAS_MASTER 0x22 -#define B2056_SYN_TOPBIAS_RCAL 0x23 -#define B2056_SYN_AFEREG 0x24 -#define B2056_SYN_TEMPPROCSENSE0x25 -#define B2056_SYN_TEMPPROCSENSEIDAC0x26 -#define B2056_SYN_TEMPPROCSENSERCAL0x27 -#define B2056_SYN_LPO 0x28 -#define B2056_SYN_VDDCAL_MASTER0x29 -#define B2056_SYN_VDDCAL_IDAC 0x2A -#define B2056_SYN_VDDCAL_STATUS0x2B -#define B2056_SYN_RCAL_MASTER 0x2C -#define B2056_SYN_RCAL_CODE_OUT0x2D -#define B2056_SYN_RCCAL_CTRL0 0x2E -#define B2056_SYN_RCCAL_CTRL1 0x2F -#define B2056_SYN_RCCAL_CTRL2 0x30 -#define B2056_SYN_RCCAL_CTRL3 0x31 -#define B2056_SYN_RCCAL_CTRL4 0x32 -#define B2056_SYN_RCCAL_CTRL5 0x33 -#define B2056_SYN_RCCAL_CTRL6 0x34 -#define B2056_SYN_RCCAL_CTRL7 0x35 -#define B2056_SYN_RCCAL_CTRL8 0x36 -#define B2056_SYN_RCCAL_CTRL9 0x37 -#define B2056_SYN_RCCAL_CTRL10 0x38 -#define B2056_SYN_RCCAL_CTRL11 0x39 -#define B2056_SYN_ZCAL_SPARE1 0x3A -#define B2056_SYN_ZCAL_SPARE2 0x3B -#define B2056_SYN_PLL_MAST10x3C -#define B2056_SYN_PLL_MAST20x3D -#define B2056_SYN_PLL_MAST30x3E -#define B2056_SYN_PLL_BIAS_RESET 0x3F -#define B2056_SYN_PLL_XTAL00x40 -#define B2056_SYN_PLL_XTAL10x41 -#define B2056_SYN_PLL_XTAL30x42 -#define B2056_SYN_PLL_XTAL40x43 -#define B2056_SYN_PLL_XTAL50x44 -#define B2056_SYN_PLL_XTAL60x45 -#define B2056_SYN_PLL_REFDIV 0x46 -#define B2056_SYN_PLL_PFD 0x47 -#define B2056_SYN_PLL_CP1 0x48 -#define B2056_SYN_PLL_CP2 0x49 -#define B2056_SYN_PLL_CP3 0x4A -#define B2056_SYN_PLL_LOOPFILTER1 0x4B -#define B2056_SYN_PLL_LOOPFILTER2 0x4C -#define B2056_SYN_PLL_LOOPFILTER3 0x4D -#define B2056_SYN_PLL_LOOPFILTER4 0x4E -#define B2056_SYN_PLL_LOOPFILTER5 0x4F -#define B2056_SYN_PLL_MMD1 0x50 -#define B2056_SYN_PLL_MMD2 0x51 -#define 
B2056_SYN_PLL_VCO1 0x52 -#define B2056_SYN_PLL_VCO2 0x53 -#define B2056_S
Re: [PATCH 02/18] Documentation: document nospec helpers
On Mon, 8 Jan 2018 17:09:59 +0000 Mark Rutland wrote: > > I have just a couple of overall comments. > > > > - It would be nice if the document were done in RST and placed in the > >core-API manual, perhaps using kerneldoc comments for the macros > >themselves. It's already 99.9% RST now, so the changes required would > >be minimal. > > Is there any quickstart guide to RST that you can recommend? http://docutils.sourceforge.net/docs/user/rst/quickref.html works reasonably well. We have some info in the kernel documentation as well, see http://static.lwn.net/kerneldoc/doc-guide/sphinx.html Thanks, jon
Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok
On Sat, Jan 6, 2018 at 5:20 PM, Linus Torvalds wrote: > On Sat, Jan 6, 2018 at 3:31 PM, Dan Williams wrote: >> >> I assume if we put this in uaccess_begin() we also need an audit for >> paths that use access_ok but don't go on to call uaccess_begin()? A >> quick glance shows a few places where we are open coding the stac(). >> Perhaps land the lfence in stac() directly? > > Yeah, we should put it in uaccess_begin(), and in the actual user > accessor helpers that do stac. Some of them probably should be changed > to use uaccess_begin() instead while at it. > > One question for the CPU people: do we actually care and need to do > this for things that might *write* to something? The speculative write > obviously is killed, but does it perhaps bring in a cacheline even > when killed? As far as I understand a write could trigger a request-for-ownership read for the target cacheline. > Because maybe we don't need the lfence in put_user(), only in get_user()? Even though writes can trigger reads, as far as I can see the write needs to be dependent on the first out-of-bounds read: if (x < max) y = array1[x]; put_user(array2 + y, z); ...in other words that first read should be annotated with nospec_array_ptr() making an lfence in put_user() or other writes moot. yp = nospec_array_ptr(array1, x, max); if (yp) y = *yp; put_user(array2 + y, z);
[PATCH net 2/3] sctp: add a ceiling to optlen in some sockopts
Hangbin Liu reported that some sockopt calls could cause the kernel to log a warning on memory allocation failure if the user supplied a large optlen value. That is because some of them called memdup_user() without a ceiling on optlen, allowing it to try to allocate really large buffers. This patch adds a ceiling by limiting optlen to the maximum allowed that would still make sense for these sockopt. Reported-by: Hangbin Liu Signed-off-by: Marcelo Ricardo Leitner --- net/sctp/socket.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 54c046783a89e76c9909ee85c83e6be38ada41a7..022b94f11fd8ac0d3b839b16dfc14f86abf2324f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; + optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + +SCTP_AUTH_NUM_HMACS * sizeof(u16)); hmacs = memdup_user(optval, optlen); if (IS_ERR(hmacs)) @@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk, if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; + /* authkey->sca_keylength is u16, so optlen can't be bigger than +* this. +*/ + optlen = min_t(unsigned int, optlen, USHRT_MAX + +sizeof(struct sctp_authkey)); authkey = memdup_user(optval, optlen); if (IS_ERR(authkey)) @@ -3893,6 +3900,9 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, if (optlen < sizeof(*params)) return -EINVAL; + /* srs_number_streams is u16, so optlen can't be bigger than this. */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + +sizeof(__u16) * sizeof(*params)); params = memdup_user(optval, optlen); if (IS_ERR(params)) -- 2.14.3