On 04/25/2018 08:34 PM, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969       7,074,156          7,415,333
> IPv6   1,728,000       6,165,504          7,262,720
> 
> 
> Signed-off-by: David Ahern <dsah...@gmail.com>
> ---
>  include/uapi/linux/bpf.h |  68 +++++++++++++-
>  net/core/filter.c        | 233 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 300 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include <linux/types.h>
>  #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -783,6 +785,17 @@ union bpf_attr {
>   *     @size: size of 'struct bpf_xfrm_state'
>   *     @flags: room for future extensions
>   *     Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + *     Do a FIB lookup based on given parameters
> + *     @ctx:     pointer to context of type xdp_md

Nit: I would just say "pointer to context" here, since the helper is used
with both xdp and skb.

> + *     @params:  pointer to bpf_fib_lookup
> + *     @plen:    size of params argument
> + *     @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
> + *     Return: egress device index if packet is to be forwarded,
> + *             0 for local delivery (anything that needs to be handled
> + *             by the full stack), or negative on error.
> + *             If index is > 0, output data in bpf_fib_lookup is set
>   */
>  #define __BPF_FUNC_MAPPER(FN)                \
>       FN(unspec),                     \
> @@ -851,7 +864,9 @@ union bpf_attr {
>       FN(msg_pull_data),              \
>       FN(bind),                       \
>       FN(xdp_adjust_tail),            \
> -     FN(skb_get_xfrm_state),
> +     FN(skb_get_xfrm_state),         \
> +     FN(fib_lookup),                 \
> +
>  

Nit: stray trailing '\' and a double newline here.

>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
[...]

> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>  
>  /**
>   *   sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> +                               const struct neighbour *neigh,
> +                               const struct net_device *dev)
> +{
> +     memcpy(params->dmac, neigh->ha, ETH_ALEN);
> +     memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> +     params->h_vlan_TCI = 0;
> +     params->h_vlan_proto = 0;
> +
> +     return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,

Instead of passing xdp_buff here, just pass the netdev pointer. See below
for why this is needed.

> +                            struct bpf_fib_lookup *params, u32 flags)
> +{
> +     struct net *net = dev_net(ctx->rxq->dev);
> +     struct in_device *in_dev;
> +     struct neighbour *neigh;
> +     struct net_device *dev;
> +     struct fib_result res;
> +     struct fib_nh *nh;
> +     struct flowi4 fl4;
> +     int err;
> +
> +     dev = dev_get_by_index_rcu(net, params->ifindex);
> +     if (unlikely(!dev))
> +             return -ENODEV;
> +
> +     /* verify forwarding is enabled on this interface */
> +     in_dev = __in_dev_get_rcu(dev);
> +     if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> +             return 0;
> +
> +     if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +             fl4.flowi4_iif = 1;
> +             fl4.flowi4_oif = params->ifindex;
> +     } else {
> +             fl4.flowi4_iif = params->ifindex;
> +             fl4.flowi4_oif = 0;
> +     }
> +     fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> +     fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> +     fl4.flowi4_flags = 0;
> +
> +     fl4.flowi4_proto = params->l4_protocol;
> +     fl4.daddr = params->ipv4_dst;
> +     fl4.saddr = params->ipv4_src;
> +     fl4.fl4_sport = params->sport;
> +     fl4.fl4_dport = params->dport;
> +
> +     if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +             u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +             struct fib_table *tb;
> +
> +             tb = fib_get_table(net, tbid);
> +             if (unlikely(!tb))
> +                     return 0;
> +
> +             err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> +     } else {
> +             fl4.flowi4_mark = 0;
> +             fl4.flowi4_secid = 0;
> +             fl4.flowi4_tun_key.tun_id = 0;
> +             fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> +             err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> +     }
> +
> +     if (err || res.type != RTN_UNICAST)
> +             return 0;
> +
> +     if (res.fi->fib_nhs > 1)
> +             fib_select_path(net, &res, &fl4, NULL);
> +
> +     nh = &res.fi->fib_nh[res.nh_sel];
> +
> +     /* do not handle lwt encaps right now */
> +     if (nh->nh_lwtstate)
> +             return 0;
> +
> +     dev = nh->nh_dev;
> +     if (unlikely(!dev))
> +             return 0;
> +
> +     if (nh->nh_gw)
> +             params->ipv4_dst = nh->nh_gw;
> +
> +     params->rt_metric = res.fi->fib_priority;
> +
> +     /* xdp and cls_bpf programs are run in RCU-bh so
> +      * rcu_read_lock_bh is not needed here
> +      */
> +     neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> +     if (neigh)
> +             return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +     return 0;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,

Same here.

> +                            struct bpf_fib_lookup *params, u32 flags)
> +{
> +     struct net *net = dev_net(ctx->rxq->dev);
> +     struct neighbour *neigh;
> +     struct net_device *dev;
> +     struct fib6_info *f6i;
> +     struct flowi6 fl6;
> +     int strict = 0;
> +     int oif;
> +
> +     /* link local addresses are never forwarded */
> +     if (rt6_need_strict(&params->ipv6_dst) ||
> +         rt6_need_strict(&params->ipv6_src))
> +             return 0;
> +
> +     dev = dev_get_by_index_rcu(net, params->ifindex);
> +     if (unlikely(!dev))
> +             return -ENODEV;
> +
> +     if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +             fl6.flowi6_iif = 1;
> +             oif = fl6.flowi6_oif = params->ifindex;
> +     } else {
> +             oif = fl6.flowi6_iif = params->ifindex;
> +             fl6.flowi6_oif = 0;
> +             strict = RT6_LOOKUP_F_HAS_SADDR;
> +     }
> +     fl6.flowlabel = params->flowlabel;
> +     fl6.flowi6_scope = 0;
> +     fl6.flowi6_flags = 0;
> +     fl6.mp_hash = 0;
> +
> +     fl6.flowi6_proto = params->l4_protocol;
> +     fl6.daddr = params->ipv6_dst;
> +     fl6.saddr = params->ipv6_src;
> +     fl6.fl6_sport = params->sport;
> +     fl6.fl6_dport = params->dport;
> +
> +     if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +             u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +             struct fib6_table *tb;
> +
> +             tb = ipv6_stub->fib6_get_table(net, tbid);
> +             if (unlikely(!tb))
> +                     return 0;
> +
> +             f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> +     } else {
> +             fl6.flowi6_mark = 0;
> +             fl6.flowi6_secid = 0;
> +             fl6.flowi6_tun_key.tun_id = 0;
> +             fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> +             f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> +     }
> +
> +     if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> +             return 0;
> +
> +     if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> +         f6i->fib6_type != RTN_UNICAST))
> +             return 0;
> +
> +     if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> +             f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> +                                                    fl6.flowi6_oif, NULL,
> +                                                    strict);
> +
> +     if (f6i->fib6_nh.nh_lwtstate)
> +             return 0;
> +
> +     if (f6i->fib6_flags & RTF_GATEWAY)
> +             params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> +     dev = f6i->fib6_nh.nh_dev;
> +     params->rt_metric = f6i->fib6_metric;
> +
> +     /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> +      * not needed here. Can not use __ipv6_neigh_lookup_noref here
> +      * because we need to get nd_tbl via the stub
> +      */
> +     neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> +                                   ndisc_hashfn, &params->ipv6_dst, dev);
> +     if (neigh)
> +             return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +     return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
> +        struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> +     if (plen < sizeof(*params))
> +             return -EINVAL;
> +
> +     switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> +     case AF_INET:
> +             return bpf_ipv4_fib_lookup(ctx, params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> +     case AF_INET6:
> +             return bpf_ipv6_fib_lookup(ctx, params, flags);
> +#endif
> +     }
> +     return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_fib_lookup_proto = {
> +     .func           = bpf_fib_lookup,
> +     .gpl_only       = true,
> +     .pkt_access     = true,
> +     .ret_type       = RET_INTEGER,
> +     .arg1_type      = ARG_PTR_TO_CTX,
> +     .arg2_type      = ARG_PTR_TO_MEM,
> +     .arg3_type      = ARG_CONST_SIZE,
> +     .arg4_type      = ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>               return &bpf_get_socket_cookie_proto;
>       case BPF_FUNC_get_socket_uid:
>               return &bpf_get_socket_uid_proto;
> +     case BPF_FUNC_fib_lookup:
> +             return &bpf_fib_lookup_proto;

This part doesn't belong in sk_filter_func_proto(); it belongs in
tc_cls_act_func_proto() instead.

>       default:
>               return bpf_base_func_proto(func_id);
>       }
> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>               return &bpf_xdp_redirect_map_proto;
>       case BPF_FUNC_xdp_adjust_tail:
>               return &bpf_xdp_adjust_tail_proto;
> +     case BPF_FUNC_fib_lookup:
> +             return &bpf_fib_lookup_proto;

Basically, you're using the very same bpf_fib_lookup_proto for
both XDP and skb. In the skb case, the same two functions
bpf_ipv{4,6}_fib_lookup() are reused, so when they fetch the netdev
pointer to retrieve the netns, you'll crash at dev_net(ctx->rxq->dev),
since that access is XDP-only and not part of the skb metadata.

Therefore, as mentioned, pass the netdev to bpf_ipv{4,6}_fib_lookup()
to make them generic, and add bpf_xdp_fib_lookup_proto and
bpf_skb_fib_lookup_proto, each returned under case BPF_FUNC_fib_lookup
in its respective *func_proto(), with the proper prototype for its
context. That is, both reuse bpf_ipv{4,6}_fib_lookup() from their own
BPF_CALL_4() helper implementations.

>       default:
>               return bpf_base_func_proto(func_id);
>       }
> 

Reply via email to