From: Joe Stringer <[email protected]> To receive packets, an eBPF program has to be attached to a netdev through tc ingress/egress, and an XDP program has to be attached to a netdev's XDP hook point. This patch introduces two new netdev_class functions, set_filter and set_xdp, for this purpose. Currently two netdev types, netdev-linux and netdev-vport, implement them.
Signed-off-by: William Tu <[email protected]> Co-authored-by: William Tu <[email protected]> Co-authored-by: Yifeng Sun <[email protected]> --- include/linux/pkt_cls.h | 21 +++ lib/dpif-netdev.c | 29 ++-- lib/netdev-bsd.c | 2 + lib/netdev-dpdk.c | 2 + lib/netdev-dummy.c | 2 + lib/netdev-linux.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++- lib/netdev-linux.h | 2 + lib/netdev-provider.h | 11 ++ lib/netdev-vport.c | 145 +++++++++++++++- lib/netdev.c | 25 +++ lib/netdev.h | 4 + 11 files changed, 655 insertions(+), 24 deletions(-) diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index f7bc7ea708d7..770af90a5c64 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -104,6 +104,27 @@ enum { __TCA_BASIC_MAX }; +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + /* Flower classifier */ enum { diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index ba62128c758c..baff020fe3d0 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1505,12 +1505,6 @@ dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd) ovs_mutex_unlock(&pmd->cond_mutex); } -static uint32_t -hash_port_no(odp_port_t port_no) -{ - return hash_int(odp_to_u32(port_no), 0); -} - static int port_create(const char *devname, const char *type, odp_port_t port_no, struct dp_netdev_port **portp) @@ -1525,6 +1519,7 @@ port_create(const char *devname, const char *type, /* Open and validate network device. 
*/ error = netdev_open(devname, type, &netdev); + VLOG_INFO("%s %s error %d", __func__, devname, error); if (error) { return error; } @@ -1578,7 +1573,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type, return error; } - hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); + hmap_insert(&dp->ports, &port->node, netdev_hash_port_no(port_no)); seq_change(dp->port_seq); reconfigure_datapath(dp); @@ -1596,6 +1591,8 @@ dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev, odp_port_t port_no; int error; + VLOG_INFO("%s", __func__); + ovs_mutex_lock(&dp->port_mutex); dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf); if (*port_nop != ODPP_NONE) { @@ -1648,7 +1645,8 @@ dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no) { struct dp_netdev_port *port; - HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) { + HMAP_FOR_EACH_WITH_HASH (port, node, netdev_hash_port_no(port_no), + &dp->ports) { if (port->port_no == port_no) { return port; } @@ -1808,7 +1806,7 @@ dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, odp_port_t in_port) { struct dpcls *cls; - uint32_t hash = hash_port_no(in_port); + uint32_t hash = netdev_hash_port_no(in_port); CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) { if (cls->in_port == in_port) { /* Port classifier exists already */ @@ -1824,7 +1822,7 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd, OVS_REQUIRES(pmd->flow_mutex) { struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); - uint32_t hash = hash_port_no(in_port); + uint32_t hash = netdev_hash_port_no(in_port); if (!cls) { /* Create new classifier for in_port */ @@ -3311,7 +3309,7 @@ tx_port_lookup(const struct hmap *hmap, odp_port_t port_no) { struct tx_port *tx; - HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) { + HMAP_FOR_EACH_IN_BUCKET (tx, node, netdev_hash_port_no(port_no), hmap) { if (tx->port->port_no == port_no) { return tx; } 
@@ -4034,13 +4032,13 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) { tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node, - hash_port_no(tx_port_cached->port->port_no)); + netdev_hash_port_no(tx_port_cached->port->port_no)); } if (netdev_n_txq(tx_port->port->netdev)) { tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); hmap_insert(&pmd->send_port_cache, &tx_port_cached->node, - hash_port_no(tx_port_cached->port->port_no)); + netdev_hash_port_no(tx_port_cached->port->port_no)); } } } @@ -4793,7 +4791,8 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, tx->flush_time = 0LL; dp_packet_batch_init(&tx->output_pkts); - hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no)); + hmap_insert(&pmd->tx_ports, &tx->node, + netdev_hash_port_no(tx->port->port_no)); pmd->need_reload = true; } @@ -5965,7 +5964,7 @@ dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED, /* Reinsert with new port number. 
*/ port->port_no = port_no; - hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); + hmap_insert(&dp->ports, &port->node, netdev_hash_port_no(port_no)); reconfigure_datapath(dp); seq_change(dp->port_seq); diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index 05974c100895..1460ae2504c5 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -1516,6 +1516,8 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off, NULL, /* set_advertisement */ \ NULL, /* get_pt_mode */ \ NULL, /* set_policing */ \ + NULL, /* set_filter */ \ + NULL, /* set_xdp */ \ NULL, /* get_qos_type */ \ NULL, /* get_qos_capabilities */ \ NULL, /* get_qos */ \ diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 52d8fe6b7ac2..20116c22137e 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -3854,6 +3854,8 @@ unlock: NULL, /* get_pt_mode */ \ \ netdev_dpdk_set_policing, \ + NULL, /* set_filter */ \ + NULL, /* set_xdp */ \ netdev_dpdk_get_qos_types, \ NULL, /* get_qos_capabilities */ \ netdev_dpdk_get_qos, \ diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 4246af3b9c86..44c9458a9a22 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -1427,6 +1427,8 @@ netdev_dummy_update_flags(struct netdev *netdev_, NULL, /* get_pt_mode */ \ \ NULL, /* set_policing */ \ + NULL, /* set_filter */ \ + NULL, /* set_xdp */ \ NULL, /* get_qos_types */ \ NULL, /* get_qos_capabilities */ \ NULL, /* get_qos */ \ diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 4e0473cf331f..121dd3bc738e 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -46,6 +46,9 @@ #include <string.h> #include <unistd.h> +#include <bpf/libbpf.h> /* linux/tools/bpf/libbpf.h */ + +#include "bpf.h" #include "coverage.h" #include "dp-packet.h" #include "dpif-netlink.h" @@ -227,6 +230,9 @@ enum { VALID_VPORT_STAT_ERROR = 1 << 5, VALID_DRVINFO = 1 << 6, VALID_FEATURES = 1 << 7, + VALID_INGRESS_FILTER = 1 << 8, + VALID_EGRESS_FILTER = 1 << 9, + VALID_XDP_FILTER = 1 << 10, }; /* Traffic 
control. */ @@ -421,6 +427,7 @@ static const struct tc_ops tc_ops_sfq; static const struct tc_ops tc_ops_default; static const struct tc_ops tc_ops_noop; static const struct tc_ops tc_ops_other; +static const struct tc_ops tc_ops_clsact; static const struct tc_ops *const tcs[] = { &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */ @@ -431,6 +438,7 @@ static const struct tc_ops *const tcs[] = { &tc_ops_noop, /* Non operating qos type. */ &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */ &tc_ops_other, /* Some other qdisc. */ + &tc_ops_clsact, /* Classifier with nested action. */ NULL }; @@ -442,8 +450,12 @@ static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *, int type, unsigned int flags, struct ofpbuf *); +static int clsact_install__(struct netdev *netdev_); static int tc_add_policer(struct netdev *, uint32_t kbits_rate, uint32_t kbits_burst); +static int tc_add_filter(struct netdev *, int fd, uint32_t parent, + const char *name); +static bool tc_is_clsact(const struct tc *tc); static int tc_parse_qdisc(const struct ofpbuf *, const char **kind, struct nlattr **options); @@ -485,13 +497,19 @@ struct netdev_linux { long long int carrier_resets; uint32_t kbits_rate; /* Policing data. */ uint32_t kbits_burst; + uint32_t ingress_filter; /* BPF ingress filter fd. */ + uint32_t egress_filter; /* BPF egress filter fd. */ + uint32_t ingress_xdp_filter;/* XDP ingress filter fd. */ int vport_stats_error; /* Cached error code from vport_get_stats(). 0 or an errno value. */ int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */ int ether_addr_error; /* Cached error code from set/get etheraddr. */ int netdev_policing_error; /* Cached error code from set policing. */ + int ingress_filter_error; /* Cached error code from set filter. */ + int egress_filter_error; /* Cached error code from set filter. */ int get_features_error; /* Cached error code from ETHTOOL_GSET. 
*/ int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */ + int ingress_xdp_error; enum netdev_features current; /* Cached from ETHTOOL_GSET. */ enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */ @@ -2159,8 +2177,14 @@ netdev_linux_set_policing(struct netdev *netdev_, if (kbits_rate) { error = tc_add_del_ingress_qdisc(ifindex, true); if (error) { - VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s", - netdev_name, ovs_strerror(error)); + const char *bpf_conflict = ""; + + if (error == EEXIST && (netdev->ingress_filter + || netdev->egress_filter)) { + bpf_conflict = " (conflicts with BPF)"; + } + VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s%s", + netdev_name, ovs_strerror(error), bpf_conflict); goto out; } @@ -2184,6 +2208,268 @@ out: return error; } +/* Attempts to set a BPF filter on the device. Returns 0 if successful, + * otherwise a positive errno value. */ +static int +netdev_linux_set_filter__(struct netdev *netdev_, const struct bpf_prog *prog, + unsigned int valid_bit, int *filter_error, + uint32_t *netdev_filter) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + const char *netdev_name = netdev_get_name(netdev_); + int error; + + VLOG_DBG("Setting %s filter %d on %s (handle %08"PRIx32")", prog->name, + prog->fd, netdev_name, prog->handle); + + if (netdev->cache_valid & valid_bit) { + error = *filter_error; + if (error || (prog && prog->fd == *netdev_filter)) { + /* Assume that settings haven't changed since we last set them. */ + goto out; + } + netdev->cache_valid &= ~valid_bit; + } + + /* Remove non-clsact qdiscs. 
*/ + if (netdev->tc && !tc_is_clsact(netdev->tc)) { + error = tc_del_qdisc(netdev_); + if (error) { + VLOG_WARN_RL(&rl, "%s: removing qdisc failed: %s", + netdev_name, ovs_strerror(error)); + goto out; + } + } + + if (prog) { + if (!netdev->tc || !tc_is_clsact(netdev->tc)) { + error = clsact_install__(netdev_); + if (error && error != EEXIST) { + VLOG_WARN_RL(&rl, "%s: clsact qdisc setup failed: %s", + netdev_name, ovs_strerror(error)); + goto out; + } + } + + error = tc_add_filter(netdev_, prog->fd, prog->handle, prog->name); + if (error){ + VLOG_WARN_RL(&rl, "%s: adding filter %s failed: %s", + netdev_name, prog->name, ovs_strerror(error)); + goto out; + } + } + + *netdev_filter = prog ? prog->fd : 0; + +out: + if (!error || error == ENODEV) { + *filter_error = error; + netdev->cache_valid |= valid_bit; + } + return error; +} + +static int +netdev_linux_set_filter(struct netdev *netdev_, const struct bpf_prog *prog) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + if (!prog || prog->handle == INGRESS_HANDLE) { + error = netdev_linux_set_filter__(netdev_, prog, VALID_INGRESS_FILTER, + &netdev->ingress_filter_error, + &netdev->ingress_filter); + } else { + error = netdev_linux_set_filter__(netdev_, prog, VALID_EGRESS_FILTER, + &netdev->egress_filter_error, + &netdev->egress_filter); + } + ovs_mutex_unlock(&netdev->mutex); + + return error; +} + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +/* Extract from libbpf */ +int +bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags) +{ + + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + socklen_t addrlen; + int one = 1; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) 
{ + return -errno; + } + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + VLOG_WARN_RL(&rl, "Netlink error reporting not supported"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + goto cleanup; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | IFLA_XDP; + nla->nla_len = NLA_HDRLEN; + + /* add XDP fd */ + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len += nla_xdp->nla_len; + + /* if user passed in any flags, add those too */ + if (flags) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FLAGS; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); + memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); + nla->nla_len += nla_xdp->nla_len; + } + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + /* send */ + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto cleanup; + } + + /* recv */ + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto cleanup; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != sa.nl_pid) { + ret = -1; + goto cleanup; + } + if (nh->nlmsg_seq != seq) { + ret = -1; + goto cleanup; + } + 
switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + /* nla_dump_errormsg(nh); */ + goto cleanup; + case NLMSG_DONE: + break; + default: + break; + } + } + + ret = 0; + +cleanup: + close(sock); + return ret; +} + +static int +netdev_linux_set_xdp__(struct netdev *netdev_, const struct bpf_prog *prog, + unsigned int valid_bit, int *filter_error, + uint32_t *netdev_filter) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + const char *netdev_name = netdev_get_name(netdev_); + int ifindex = netdev->ifindex; + int error; + + VLOG_DBG("Setting %s XDP filter %d on %s (ifindex %d)", prog->name, + prog->fd, netdev_name, ifindex); + + if (netdev->cache_valid & valid_bit) { + error = *filter_error; + if (error || (prog && prog->fd == *netdev_filter)) { + /* Assume that settings haven't changed since we last set them. */ + goto out; + } + netdev->cache_valid &= ~valid_bit; + } + error = bpf_set_link_xdp_fd(ifindex, prog->fd, XDP_FLAGS_SKB_MODE); + if (error < 0) { + VLOG_WARN_RL(&rl, "%s: adding XDP filter %s failed: %s", + netdev_name, prog->name, ovs_strerror(error)); + goto out; + } + +out: + if (!error || error == ENODEV) { + *filter_error = error; + netdev->cache_valid |= valid_bit; + } + return error; +} + +static int +netdev_linux_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + error = netdev_linux_set_xdp__(netdev_, prog, VALID_XDP_FILTER, + &netdev->ingress_xdp_error, + &netdev->ingress_xdp_filter); + ovs_mutex_unlock(&netdev->mutex); + + return error; +} + static int netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED, struct sset *types) @@ -2879,6 +3165,8 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, NULL, /* get_pt_mode */ \ \ netdev_linux_set_policing, \ + netdev_linux_set_filter, \ + 
netdev_linux_set_xdp, \ netdev_linux_get_qos_types, \ netdev_linux_get_qos_capabilities, \ netdev_linux_get_qos, \ @@ -4671,6 +4959,74 @@ static const struct tc_ops tc_ops_other = { NULL /* class_dump_stats */ }; +/* "linux-clsact" traffic control class. */ +static int +clsact_setup_qdisc(struct netdev *netdev) +{ + struct ofpbuf request; + struct tcmsg *tcmsg; + + tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC, + NLM_F_EXCL | NLM_F_CREATE, &request); + if (!tcmsg) { + return ENODEV; + } + tcmsg->tcm_handle = tc_make_handle(0xFFFF, 0); + tcmsg->tcm_parent = TC_H_INGRESS; + nl_msg_put_string(&request, TCA_KIND, "clsact"); + nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0); + + return tc_transact(&request, NULL); +} + +static int +clsact_install__(struct netdev *netdev_) +{ + static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_clsact); + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + error = clsact_setup_qdisc(netdev_); + if (error) { + return error; + } + + /* Nothing but a tc class implementation is allowed to write to a tc. This + * class never does that, so we can legitimately use a const tc object. */ + netdev->tc = CONST_CAST(struct tc *, &tc); + + return 0; +} + +static int +clsact_tc_install(struct netdev *netdev, + const struct smap *details OVS_UNUSED) +{ + return clsact_install__(netdev); +} + +static int +clsact_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) +{ + return clsact_install__(netdev); +} + +static const struct tc_ops tc_ops_clsact = { + "clsact", /* linux_name */ + "linux-clsact", /* ovs_name */ + 0, /* n_queues */ + clsact_tc_install, + clsact_tc_load, + NULL, /* tc_destroy */ + NULL, /* qdisc_get */ + NULL, /* qdisc_set */ + NULL, /* class_get */ + NULL, /* class_set */ + NULL, /* class_delete */ + NULL, /* class_get_stats */ + NULL /* class_dump_stats */ +}; + /* Traffic control. */ /* Number of kernel "tc" ticks per second. 
*/ @@ -4775,6 +5131,49 @@ tc_add_policer(struct netdev *netdev, return 0; } +/* Adds a filter to 'netdev' corresponding to BPF program associated with 'fd'. + * + * This function is equivalent to running: + * /sbin/tc filter add dev <devname> <parent> bpf da object-pinned <path> + * + * The configuration and stats may be seen with the following command: + * /sbin/tc -s filter show dev <devname> <parent> + * + * Returns 0 if successful, otherwise a positive errno value. + */ +static int +tc_add_filter(struct netdev *netdev, int fd, uint32_t parent, const char *name) +{ + struct ofpbuf request; + struct tcmsg *tcmsg; + size_t opts_offset; + int error; + + tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER, + NLM_F_EXCL | NLM_F_CREATE, &request); + if (!tcmsg) { + return ENODEV; + } + tcmsg->tcm_handle = tc_make_handle(0, 0x1); + tcmsg->tcm_parent = parent; + tcmsg->tcm_info = tc_make_handle(0, /* preference */ + (OVS_FORCE uint16_t) htons(ETH_P_ALL)); + + nl_msg_put_string(&request, TCA_KIND, "bpf"); + opts_offset = nl_msg_start_nested(&request, TCA_OPTIONS); + nl_msg_put_u32(&request, TCA_BPF_FLAGS, TCA_BPF_FLAG_ACT_DIRECT); + nl_msg_put_u32(&request, TCA_BPF_FD, fd); + nl_msg_put_string(&request, TCA_BPF_NAME, name); + nl_msg_end_nested(&request, opts_offset); + + error = tc_transact(&request, NULL); + if (error) { + return error; + } + + return 0; +} + static void read_psched(void) { @@ -5060,21 +5459,21 @@ tc_delete_class(const struct netdev *netdev, unsigned int handle) return error; } -/* Equivalent to "tc qdisc del dev <name> root". */ +/* Equivalent to "tc qdisc del dev <name> handle <handle> <parent>". 
*/ static int -tc_del_qdisc(struct netdev *netdev_) +tc_del_qdisc__(struct netdev_linux *netdev, uint32_t parent, uint32_t handle) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); struct ofpbuf request; struct tcmsg *tcmsg; int error; - tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request); + tcmsg = netdev_linux_tc_make_request(&netdev->up, RTM_DELQDISC, 0, + &request); if (!tcmsg) { return ENODEV; } - tcmsg->tcm_handle = tc_make_handle(1, 0); - tcmsg->tcm_parent = TC_H_ROOT; + tcmsg->tcm_handle = handle; + tcmsg->tcm_parent = parent; error = tc_transact(&request, NULL); if (error == EINVAL) { @@ -5092,6 +5491,27 @@ tc_del_qdisc(struct netdev *netdev_) } static bool +tc_is_clsact(const struct tc *tc) +{ + if (!tc || !tc->ops->linux_name) { + return false; + } + return !strcmp(tc->ops->linux_name, "clsact"); +} + +static int +tc_del_qdisc(struct netdev *netdev_) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + + if (netdev->tc && tc_is_clsact(netdev->tc)) { + return tc_del_qdisc__(netdev, TC_H_INGRESS, + tc_make_handle(TC_H_INGRESS, 0)); + } + return tc_del_qdisc__(netdev, TC_H_ROOT, tc_make_handle(1, 0)); +} + +static bool getqdisc_is_safe(void) { static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; diff --git a/lib/netdev-linux.h b/lib/netdev-linux.h index 880f86402a1e..8257d4c695f9 100644 --- a/lib/netdev-linux.h +++ b/lib/netdev-linux.h @@ -29,6 +29,8 @@ int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag, const char *flag_name, bool enable); int linux_get_ifindex(const char *netdev_name); +int bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags); + #define LINUX_FLOW_OFFLOAD_API \ netdev_tc_flow_flush, \ netdev_tc_flow_dump_create, \ diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index 25bd671c1382..3e53a5b76272 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -32,6 +32,7 @@ extern "C" { #endif +struct bpf_prog; struct netdev_tnl_build_header_params; 
#define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC @@ -505,6 +506,16 @@ struct netdev_class { int (*set_policing)(struct netdev *netdev, unsigned int kbits_rate, unsigned int kbits_burst); + /* Attempts to attach a traffic filter in the form of an (e)BPF program. + * + * This function may be set to null if filters are not supported. */ + int (*set_filter)(struct netdev *netdev, const struct bpf_prog *); + + /* Attempts to attach a XDP eBPF program. + * + * This function may be set to null if filters are not supported. */ + int (*set_xdp)(struct netdev *netdev, const struct bpf_prog *); + /* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves * it empty if 'netdev' does not support QoS. Any names added to 'types' * should be documented as valid for the "type" column in the "QoS" table diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 52aa12d79933..4341c89894a3 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -22,12 +22,14 @@ #include <errno.h> #include <fcntl.h> #include <sys/socket.h> +#include <linux/rtnetlink.h> #include <net/if.h> #include <sys/types.h> #include <netinet/in.h> #include <netinet/ip6.h> #include <sys/ioctl.h> +#include "bpf.h" #include "byte-order.h" #include "daemon.h" #include "dirs.h" @@ -43,6 +45,7 @@ #include "route-table.h" #include "smap.h" #include "socket-util.h" +#include "tc.h" #include "unaligned.h" #include "unixctl.h" #include "openvswitch/vlog.h" @@ -72,6 +75,10 @@ struct vport_class { struct netdev_class netdev_class; }; +/* This is set pretty low because we probably won't learn anything from the + * additional log messages. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + bool netdev_vport_is_vport_class(const struct netdev_class *class) { @@ -866,6 +873,140 @@ netdev_vport_get_ifindex(const struct netdev *netdev_) return linux_get_ifindex(name); } +/* "linux-clsact" traffic control class. 
*/ +static int +clsact_setup_qdisc(struct netdev *netdev) +{ + struct ofpbuf request; + struct tcmsg *tcmsg; + int ifindex; + + ifindex = netdev_vport_get_ifindex(netdev); + + tcmsg = tc_make_request(ifindex, RTM_NEWQDISC, NLM_F_EXCL | NLM_F_CREATE, + &request); + if (!tcmsg) { + return ENODEV; + } + tcmsg->tcm_handle = tc_make_handle(0xFFFF, 0); + tcmsg->tcm_parent = TC_H_INGRESS; + nl_msg_put_string(&request, TCA_KIND, "clsact"); + nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0); + + return tc_transact(&request, NULL); +} + +static int +tc_add_filter(struct netdev *netdev, int fd, uint32_t parent, const char *name) +{ + struct ofpbuf request; + struct tcmsg *tcmsg; + size_t opts_offset; + int ifindex; + int error; + + ifindex = netdev_vport_get_ifindex(netdev); + + tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_EXCL | NLM_F_CREATE, + &request); + if (!tcmsg) { + return ENODEV; + } + tcmsg->tcm_handle = tc_make_handle(0, 0x1); + tcmsg->tcm_parent = parent; +#define ETH_P_ALL 0x0003 + tcmsg->tcm_info = tc_make_handle(0, /* preference */ + (OVS_FORCE uint16_t) htons(ETH_P_ALL)); + + nl_msg_put_string(&request, TCA_KIND, "bpf"); + opts_offset = nl_msg_start_nested(&request, TCA_OPTIONS); + nl_msg_put_u32(&request, TCA_BPF_FLAGS, TCA_BPF_FLAG_ACT_DIRECT); + nl_msg_put_u32(&request, TCA_BPF_FD, fd); + nl_msg_put_string(&request, TCA_BPF_NAME, name); + nl_msg_end_nested(&request, opts_offset); + + error = tc_transact(&request, NULL); + if (error) { + return error; + } + + return 0; +} + +/* Attempts to set a BPF filter on the device. Returns 0 if successful, + * otherwise a positive errno value. 
*/ +static int +netdev_vport_set_filter__(struct netdev *netdev_, const struct bpf_prog *prog, + unsigned int OVS_UNUSED valid_bit, int OVS_UNUSED *filter_error, + uint32_t OVS_UNUSED *netdev_filter) +{ + struct netdev_vport OVS_UNUSED *netdev = netdev_vport_cast(netdev_); + const char *netdev_name = netdev_get_name(netdev_); + int error; + + if (!prog) { + return 0; + } + + VLOG_DBG("Setting %s filter %d on %s (handle %08"PRIx32")", prog->name, + prog->fd, netdev_name, prog->handle); + + error = clsact_setup_qdisc(netdev_); + if (error && error != EEXIST) { + VLOG_WARN("%s: clsact qdisc setup failed: %s", + netdev_name, ovs_strerror(error)); + goto out; + } + + error = tc_add_filter(netdev_, prog->fd, prog->handle, prog->name); + if (error){ + VLOG_WARN_RL(&rl, "%s: adding filter %s failed: %s", + netdev_name, prog->name, ovs_strerror(error)); + goto out; + } + +out: + VLOG_INFO("%s %d", __func__, error); + return error; +} + +static int +netdev_vport_set_filter(struct netdev *netdev_, const struct bpf_prog *prog) +{ + struct netdev_vport *netdev = netdev_vport_cast(netdev_); + int error = 0; + + ovs_mutex_lock(&netdev->mutex); + if (!prog || prog->handle == INGRESS_HANDLE) { + error = netdev_vport_set_filter__(netdev_, prog, 0, NULL, NULL); + } + ovs_mutex_unlock(&netdev->mutex); + + VLOG_INFO("%s %d", __func__, error); + + return error; +} + +int bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags); + +static int +netdev_vport_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog) +{ + struct netdev_vport *netdev = netdev_vport_cast(netdev_); + int error = 0; + int ifindex; + + ovs_mutex_lock(&netdev->mutex); + ifindex = netdev_vport_get_ifindex(netdev_); + error = bpf_set_link_xdp_fd(ifindex, prog->fd, + XDP_FLAGS_SKB_MODE); + ovs_mutex_unlock(&netdev->mutex); + + VLOG_INFO("%s %d", __func__, error); + + return error; +} + #define NETDEV_VPORT_GET_IFINDEX netdev_vport_get_ifindex #define NETDEV_FLOW_OFFLOAD_API LINUX_FLOW_OFFLOAD_API #else /* 
!__linux__ */ @@ -914,6 +1055,8 @@ netdev_vport_get_ifindex(const struct netdev *netdev_) get_pt_mode, \ \ NULL, /* set_policing */ \ + netdev_vport_set_filter, /* set_filter */ \ + netdev_vport_set_xdp, /* set_xdp */ \ NULL, /* get_qos_types */ \ NULL, /* get_qos_capabilities */ \ NULL, /* get_qos */ \ @@ -972,7 +1115,7 @@ netdev_vport_tunnel_register(void) TUNNEL_CLASS("gre", "gre_sys", netdev_gre_build_header, netdev_gre_push_header, netdev_gre_pop_header, - NULL), + NETDEV_VPORT_GET_IFINDEX), TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header, netdev_tnl_push_udp_header, netdev_vxlan_pop_header, diff --git a/lib/netdev.c b/lib/netdev.c index be05dc64024a..c44a1a683b92 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -759,6 +759,13 @@ netdev_get_pt_mode(const struct netdev *netdev) : NETDEV_PT_LEGACY_L2); } +/* Returns a 32-bit hash of the given port number. */ +uint32_t +netdev_hash_port_no(odp_port_t port_no) +{ + return hash_int(odp_to_u32(port_no), 0); +} + /* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet), * otherwise a positive errno value. Returns EAGAIN without blocking if * at least one the packets cannot be queued immediately. Returns EMSGSIZE @@ -1449,6 +1456,24 @@ netdev_set_policing(struct netdev *netdev, uint32_t kbits_rate, : EOPNOTSUPP); } +/* Attempts to apply (e)BPF filter 'prog' to the netdev. */ +int +netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog) +{ + return (netdev->netdev_class->set_filter + ? netdev->netdev_class->set_filter(netdev, prog) + : EOPNOTSUPP); +} + +/* Attempts to apply (e)BPF filter 'prog' to the netdev. */ +int +netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog) +{ + return (netdev->netdev_class->set_xdp + ? netdev->netdev_class->set_xdp(netdev, prog) + : EOPNOTSUPP); +} + /* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves it * empty if 'netdev' does not support QoS. 
Any names added to 'types' should * be documented as valid for the "type" column in the "QoS" table in diff --git a/lib/netdev.h b/lib/netdev.h index ff1b604b24e2..3388504d85c9 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -59,6 +59,7 @@ extern "C" { * netdev and access each of those from a different thread.) */ +struct bpf_prog; struct dp_packet_batch; struct dp_packet; struct netdev_class; @@ -167,6 +168,7 @@ bool netdev_mtu_is_user_config(struct netdev *); int netdev_get_ifindex(const struct netdev *); int netdev_set_tx_multiq(struct netdev *, unsigned int n_txq); enum netdev_pt_mode netdev_get_pt_mode(const struct netdev *); +uint32_t netdev_hash_port_no(odp_port_t port_no); /* Packet reception. */ int netdev_rxq_open(struct netdev *, struct netdev_rxq **, int id); @@ -316,6 +318,8 @@ struct netdev_queue_stats { int netdev_set_policing(struct netdev *, uint32_t kbits_rate, uint32_t kbits_burst); +int netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog); +int netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog); int netdev_get_qos_types(const struct netdev *, struct sset *types); int netdev_get_qos_capabilities(const struct netdev *, -- 2.7.4 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
