On Tue, Apr 21, 2020 at 10:48 PM Toshiaki Makita <[email protected]> wrote: > > This adds a reference program, flowtable_afxdp.o, which can be used to > offload flows to XDP through netdev-offload-xdp. > The program will be compiled when using --enable-bpf switch. Hi Toshiaki Good! One question, did we test the performance with this patch ? > Signed-off-by: Toshiaki Makita <[email protected]> > --- > Makefile.am | 9 +- > acinclude.m4 | 56 +++++ > bpf/.gitignore | 4 + > bpf/Makefile.am | 59 +++++ > bpf/bpf_miniflow.h | 179 +++++++++++++++ > bpf/bpf_netlink.h | 34 +++ > bpf/bpf_workaround.h | 28 +++ > bpf/flowtable_afxdp.c | 515 ++++++++++++++++++++++++++++++++++++++++++ > configure.ac | 2 + > 9 files changed, 884 insertions(+), 2 deletions(-) > create mode 100644 bpf/.gitignore > create mode 100644 bpf/Makefile.am > create mode 100644 bpf/bpf_miniflow.h > create mode 100644 bpf/bpf_netlink.h > create mode 100644 bpf/bpf_workaround.h > create mode 100644 bpf/flowtable_afxdp.c > > diff --git a/Makefile.am b/Makefile.am > index b279303d1..f18bfefde 100644 > --- a/Makefile.am > +++ b/Makefile.am > @@ -8,6 +8,9 @@ > AUTOMAKE_OPTIONS = foreign subdir-objects > ACLOCAL_AMFLAGS = -I m4 > SUBDIRS = datapath > +if HAVE_BPF > +SUBDIRS += bpf > +endif > > AM_CPPFLAGS = $(SSL_CFLAGS) > AM_LDFLAGS = $(SSL_LDFLAGS) > @@ -198,7 +201,9 @@ ALL_LOCAL += dist-hook-git > dist-hook-git: distfiles > @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \ > (cd datapath && $(MAKE) distfiles); \ > - (cat distfiles; sed 's|^|datapath/|' datapath/distfiles) | \ > + (cd bpf && $(MAKE) distfiles); \ > + (cat distfiles; sed 's|^|datapath/|' datapath/distfiles; \ > + sed 's|^|bpf/|' bpf/distfiles) | \ > LC_ALL=C sort -u > all-distfiles; \ > (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \ > grep -v '\.gitattributes$$' | \ > @@ -234,7 +239,7 @@ config-h-check: > @cd $(srcdir); \ > if test -e .git && (git --version) >/dev/null 2>&1 && \ > git --no-pager grep -L '#include 
<config\.h>' `git ls-files | grep > '\.c$$' | \ > - grep -vE > '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \ > + grep -vE > '^datapath|^lib/sflow|^third-party|^datapath-windows|^python|^bpf'`; \ > then \ > echo "See above for list of violations of the rule that"; \ > echo "every C source file must #include <config.h>."; \ > diff --git a/acinclude.m4 b/acinclude.m4 > index 0901f2870..2fb2f385f 100644 > --- a/acinclude.m4 > +++ b/acinclude.m4 > @@ -301,6 +301,62 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ > AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true) > ]) > > +dnl OVS_CHECK_LINUX_BPF > +dnl > +dnl Check both llvm and libbpf support > +AC_DEFUN([OVS_CHECK_LINUX_BPF], [ > + AC_ARG_ENABLE([bpf], > + [AC_HELP_STRING([--enable-bpf], > + [Compile reference eBPF programs for XDP])], > + [], [enable_bpf=no]) > + AC_MSG_CHECKING([whether BPF is enabled]) > + if test "$enable_bpf" != yes; then > + AC_MSG_RESULT([no]) > + BPF_ENABLE=false > + else > + AC_MSG_RESULT([yes]) > + BPF_ENABLE=true > + > + AC_CHECK_PROG(CLANG_CHECK, clang, yes) > + AS_IF([test X"$CLANG_CHECK" != X"yes"], > + [AC_MSG_ERROR([unable to find clang to compile BPF program])]) > + > + AC_CHECK_PROG(LLC_CHECK, llc, yes) > + AS_IF([test X"$LLC_CHECK" != X"yes"], > + [AC_MSG_ERROR([unable to find llc to compile BPF program])]) > + > + AC_CHECK_HEADER([bpf/bpf_helpers.h], [], > + [AC_MSG_ERROR([unable to find bpf/bpf_helpers.h to compile BPF > program])]) > + > + AC_CHECK_HEADER([linux/bpf.h], [], > + [AC_MSG_ERROR([unable to find linux/bpf.h to compile BPF program])]) > + > + AC_MSG_CHECKING([for LLVM bpf target support]) > + if llc -march=bpf -mattr=help >/dev/null 2>&1; then > + AC_MSG_RESULT([yes]) > + else > + AC_MSG_RESULT([no]) > + AC_MSG_ERROR([LLVM does not support bpf target]) > + fi > + > + AC_MSG_CHECKING([for BTF DATASEC support]) > + AC_LANG_CONFTEST( > + [AC_LANG_SOURCE([__attribute__((section("_x"), used)) int x;])]) > + if clang -g -O2 -S -target bpf -emit-llvm -c 
conftest.c -o conftest.ll > && \ > + llc -march=bpf -filetype=obj -o conftest.o conftest.ll && \ > + readelf -p.BTF conftest.o 2>/dev/null | grep -q -w _x; then > + AC_MSG_RESULT([yes]) > + else > + AC_MSG_RESULT([no]) > + AC_MSG_ERROR([LLVM does not support BTF DATASEC]) > + fi > + > + AC_DEFINE([HAVE_BPF], [1], > + [Define to 1 if BPF compilation is available and enabled.]) > + fi > + AM_CONDITIONAL([HAVE_BPF], test "$BPF_ENABLE" = true) > +]) > + > dnl OVS_CHECK_DPDK > dnl > dnl Configure DPDK source tree > diff --git a/bpf/.gitignore b/bpf/.gitignore > new file mode 100644 > index 000000000..ab0f2d9e4 > --- /dev/null > +++ b/bpf/.gitignore > @@ -0,0 +1,4 @@ > +*.ll > +/distfiles > +/Makefile > +/Makefile.in > diff --git a/bpf/Makefile.am b/bpf/Makefile.am > new file mode 100644 > index 000000000..f485b17f0 > --- /dev/null > +++ b/bpf/Makefile.am > @@ -0,0 +1,59 @@ > +AUTOMAKE_OPTIONS = foreign > + > +EXTRA_DIST = flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h > + > +# The following is based on commands for the Automake "distdir" target. 
> +distfiles: Makefile > + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ > + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ > + list='$(DISTFILES)'; \ > + for file in $$list; do echo $$file; done | \ > + sed -e "s|^$$srcdirstrip/||;t" \ > + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | \ > + LC_ALL=C sort -u > $@ > +CLEANFILES = distfiles > + > +CLANG ?= clang > +LLC ?= llc > + > +AM_CPPFLAGS = -I $(top_srcdir)/include > +AM_CPPFLAGS += -I $(top_builddir)/include > +AM_CPPFLAGS += -I $(top_srcdir)/lib > +AM_CPPFLAGS += -I $(top_builddir)/lib > + > +AM_CFLAGS = -Wall > +AM_CFLAGS += -Wextra > +AM_CFLAGS += -Wno-sign-compare > +AM_CFLAGS += -Wformat -Wformat-security > +AM_CFLAGS += -Wswitch-enum > +AM_CFLAGS += -Wunused-parameter > +AM_CFLAGS += -Wbad-function-cast > +AM_CFLAGS += -Wcast-align > +AM_CFLAGS += -Wstrict-prototypes > +AM_CFLAGS += -Wold-style-definition > +AM_CFLAGS += -Wmissing-field-initializers > +AM_CFLAGS += -fno-strict-aliasing > +AM_CFLAGS += -Wswitch-bool > +AM_CFLAGS += -Wlogical-not-parentheses > +AM_CFLAGS += -Wsizeof-array-argument > +AM_CFLAGS += -Wshift-negative-value > +AM_CFLAGS += -Wshadow > +AM_CFLAGS += -Wcast-align > +AM_CFLAGS += -Wno-unused-value > +AM_CFLAGS += -Wno-compare-distinct-pointer-types > +AM_CFLAGS += -g > +AM_CFLAGS += -O2 > + > +SUFFIXES = .ll > +%.ll: %.c > + $(CLANG) $(AM_CPPFLAGS) $(AM_CFLAGS) -S \ > + -target bpf -emit-llvm -c $< -o $@ > +CLEANFILES += *.ll > + > +%.o: %.ll > + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ $< > +CLEANFILES += *.o > + > +flowtable_afxdp.o: flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h > bpf_workaround.h > + > +all-local: flowtable_afxdp.o > diff --git a/bpf/bpf_miniflow.h b/bpf/bpf_miniflow.h > new file mode 100644 > index 000000000..090a9c7be > --- /dev/null > +++ b/bpf/bpf_miniflow.h > @@ -0,0 +1,179 @@ > +/* > + * Copyright (c) 2020 NTT Corp. 
> + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef BPF_MINIFLOW_H > +#define BPF_MINIFLOW_H 1 > + > +#include "flow.h" > + > +struct bpf_mf_ctx { > + struct flowmap map; > + uint64_t *data; > +}; > + > +static inline void > +miniflow_set_maps(struct bpf_mf_ctx *ctx, size_t ofs, size_t n_words) > +{ > + flowmap_set(&ctx->map, ofs, n_words); > +} > + > +static inline void > +miniflow_set_map(struct bpf_mf_ctx *ctx, size_t ofs) > +{ > + flowmap_set(&ctx->map, ofs, 1); > +} > + > +static inline void > +miniflow_push_uint8_(struct bpf_mf_ctx *ctx, size_t ofs, uint8_t value) > +{ > + size_t ofs8 = ofs % 8; > + > + if (ofs8 == 0) { > + miniflow_set_map(ctx, ofs / 8); > + } > + *((uint8_t *)ctx->data + ofs8) = value; > + if (ofs8 == 7) { > + ctx->data++; > + } > +} > + > +static inline void > +miniflow_push_uint16_(struct bpf_mf_ctx *ctx, size_t ofs, uint16_t value) > +{ > + size_t ofs8 = ofs % 8; > + > + if (ofs8 == 0) { > + miniflow_set_map(ctx, ofs / 8); > + *(uint16_t *)ctx->data = value; > + } else if (ofs8 == 2) { > + *((uint16_t *)ctx->data + 1) = value; > + } else if (ofs8 == 4) { > + *((uint16_t *)ctx->data + 2) = value; > + } else if (ofs8 == 6) { > + *((uint16_t *)ctx->data + 3) = value; > + ctx->data++; > + } > +} > + > +static inline void > +miniflow_push_uint32_(struct bpf_mf_ctx *ctx, size_t ofs, uint32_t value) > +{ > + size_t ofs8 = ofs % 8; > + > + if (ofs8 == 0) { > + miniflow_set_map(ctx, ofs / 8); > + 
*(uint32_t *)ctx->data = value; > + } else if (ofs8 == 4) { > + *((uint32_t *)ctx->data + 1) = value; > + ctx->data++; > + } > +} > + > +static inline void > +ether_addr_copy(struct eth_addr *dst, const struct eth_addr *src) > +{ > + ovs_be16 *a = dst->be16; > + const ovs_be16 *b = src->be16; > + > + a[0] = b[0]; > + a[1] = b[1]; > + a[2] = b[2]; > +} > + > +/* 'valuep' is 16-aligned */ > +/* data must start 64-aligned and must be followed by other data or padding > */ > +static inline void > +miniflow_push_macs_(struct bpf_mf_ctx *ctx, size_t ofs, > + const struct eth_addr *valuep) > +{ > + miniflow_set_maps(ctx, ofs / 8, 2); > + ether_addr_copy((struct eth_addr *)ctx->data, valuep); > + ether_addr_copy((struct eth_addr *)ctx->data + 1, valuep + 1); > + ctx->data++; /* First word only. */ > +} > + > +/* data must start 64-aligned and must be followed by other data */ > +static inline void > +miniflow_pad_from_64_(struct bpf_mf_ctx *ctx, size_t ofs) > +{ > + size_t ofs8 = ofs % 8; > + size_t ofs4 = ofs % 4; > + size_t ofs2 = ofs % 2; > + void *cdata = ctx->data; > + > + miniflow_set_map(ctx, ofs / 8); > + > + if (ofs8 >= 4) { > + *(uint32_t *)cdata = 0; > + cdata += 4; > + } > + if (ofs4 >= 2) { > + *(uint16_t *)cdata = 0; > + cdata += 2; > + } > + if (ofs2 == 1) { > + *(uint8_t *)cdata = 0; > + } > +} > + > +static inline void > +miniflow_pad_to_64_(struct bpf_mf_ctx *ctx, size_t ofs) > +{ > + size_t ofs8 = ofs % 8; > + size_t ofs4 = ofs % 4; > + size_t ofs2 = ofs % 2; > + void *cdata = ctx->data; > + > + cdata += ofs8; > + if (ofs2 == 1) { > + *(uint8_t *)cdata = 0; > + cdata++; > + } > + if (ofs4 <= 2) { > + *(uint16_t *)cdata = 0; > + cdata += 2; > + } > + if (ofs8 <= 4) { > + *(uint32_t *)cdata = 0; > + } > + ctx->data++; > +} > + > +#define miniflow_push_uint8(CTX, FIELD, VALUE) \ > + miniflow_push_uint8_(CTX, offsetof(struct flow, FIELD), VALUE) > + > +#define miniflow_push_be16_(CTX, OFS, VALUE) \ > + miniflow_push_uint16_(CTX, OFS, (OVS_FORCE 
uint16_t)VALUE) > + > +#define miniflow_push_be16(CTX, FIELD, VALUE) \ > + miniflow_push_be16_(CTX, offsetof(struct flow, FIELD), VALUE) \ > + > +#define miniflow_push_be32_(CTX, OFS, VALUE) \ > + miniflow_push_uint32_(CTX, OFS, (OVS_FORCE uint32_t)VALUE) > + > +#define miniflow_push_be32(CTX, FIELD, VALUE) \ > + miniflow_push_be32_(CTX, offsetof(struct flow, FIELD), VALUE) \ > + > +#define miniflow_push_macs(CTX, FIELD, VALUEP) \ > + miniflow_push_macs_(CTX, offsetof(struct flow, FIELD), VALUEP) > + > +#define miniflow_pad_from_64(CTX, FIELD) \ > + miniflow_pad_from_64_(CTX, offsetof(struct flow, FIELD)) > + > +#define miniflow_pad_to_64(CTX, FIELD) \ > + miniflow_pad_to_64_(CTX, OFFSETOFEND(struct flow, FIELD)) > + > +#endif /* bpf_miniflow.h */ > diff --git a/bpf/bpf_netlink.h b/bpf/bpf_netlink.h > new file mode 100644 > index 000000000..091926c61 > --- /dev/null > +++ b/bpf/bpf_netlink.h > @@ -0,0 +1,34 @@ > +/* > + * Copyright (c) 2020 NTT Corp. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +#ifndef BPF_NETLINK_H > +#define BPF_NETLINK_H 1 > + > +#include "netlink.h" > + > +static inline int > +bpf_nl_attr_type(const struct nlattr *nla) > +{ > + return nla->nla_type & NLA_TYPE_MASK; > +} > + > +static inline const void * > +bpf_nl_attr_get(const struct nlattr *nla) > +{ > + return nla + 1; > +} > + > +#endif /* bpf_netlink.h */ > diff --git a/bpf/bpf_workaround.h b/bpf/bpf_workaround.h > new file mode 100644 > index 000000000..cb072a7e6 > --- /dev/null > +++ b/bpf/bpf_workaround.h > @@ -0,0 +1,28 @@ > +/* > + * Copyright (c) 2020 NTT Corp. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef BPF_WORKAROUND_H > +#define BPF_WORKAROUND_H > + > +/* On Linux x86/x64 systems bits/wordsize.h included from stdint.h cannot > + * correctly determine __WORDSIZE for bpf, which causes incorrect UINTPTR_MAX > + */ > +#if __UINTPTR_MAX__ == __UINT64_MAX__ && defined(UINTPTR_MAX) > +#undef UINTPTR_MAX > +#define UINTPTR_MAX UINT64_MAX > +#endif > + > +#endif /* bpf_workaround.h */ > diff --git a/bpf/flowtable_afxdp.c b/bpf/flowtable_afxdp.c > new file mode 100644 > index 000000000..7a4767333 > --- /dev/null > +++ b/bpf/flowtable_afxdp.c > @@ -0,0 +1,515 @@ > +/* > + * Copyright (c) 2020 NTT Corp. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. 
> + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +/* linux/types.h is necessary for bpf_helpers.h as it's not self-contained */ > +#include <linux/types.h> > +#include <bpf/bpf_helpers.h> > +#include <linux/bpf.h> > + > +/* Workaround for incorrect macros for bpf in stdint.h */ > +#include <stdint.h> > +#include "bpf_workaround.h" > + > +#include "bpf_miniflow.h" > +#include "bpf_netlink.h" > +#include "netdev-offload-xdp.h" > +#include "packets.h" > +#include "util.h" > + > +/* Supported keys. Need to keep same 64-align as struct flow for miniflow */ > +struct xdp_flow { > + struct eth_addr dl_dst; > + struct eth_addr dl_src; > + ovs_be16 dl_type; > + uint8_t pad1[2]; > + > + union flow_vlan_hdr vlans[1]; > + uint8_t pad2[4]; > + > + ovs_be32 nw_src; > + ovs_be32 nw_dst; > + > + uint8_t pad3[4]; > + uint8_t nw_frag; > + uint8_t nw_tos; > + uint8_t nw_ttl; > + uint8_t nw_proto; > + > + ovs_be16 tp_src; > + ovs_be16 tp_dst; > + uint8_t pad4[4]; > +}; > + > +/* Size of xdp_flow must be 64-aligned for key comparison */ > +BUILD_ASSERT_DECL(sizeof(struct xdp_flow) % sizeof(uint64_t) == 0); > + > +#define XDP_FLOW_U64S (sizeof(struct xdp_flow) / sizeof(uint64_t)) > + > +#define XDP_MAX_SUBTABLE_FLOWS 1024 > +#define XDP_MAX_ACTIONS_LEN 256 > + > +/* Actual key in each subtable. 
miniflow map is omitted as it's identical to > + * mask map */ > +struct xdp_flow_key { > + union { > + uint64_t miniflow_buf[XDP_FLOW_U64S]; > + struct xdp_flow _flow; /* Need this to keep xdp_flow in BTF */ > + }; > +}; > + > +/* Value for subtable mask array */ > +struct xdp_subtable_mask { > + struct xdp_subtable_mask_header header; > + uint64_t buf[XDP_FLOW_U64S]; > +}; > + > +/* miniflow for packet */ > +struct xdp_miniflow { > + struct miniflow mf; > + struct xdp_flow_key value; > +}; > + > +/* Used when the action only modifies the packet */ > +#define _XDP_ACTION_CONTINUE -1 > + > +/* Supported actions */ > +/* XXX: This size should be uint16_t but needs to be int as kernel has a bug > + * in btf_enum_check_member() that assumes enum size is sizeof(int), which > + * causes an error when loading BTF if we use uint16_t here */ > +enum action_attrs : uint32_t { > + XDP_ACTION_OUTPUT = OVS_ACTION_ATTR_OUTPUT, > + XDP_ACTION_PUSH_VLAN = OVS_ACTION_ATTR_PUSH_VLAN, > + XDP_ACTION_POP_VLAN = OVS_ACTION_ATTR_POP_VLAN, > +}; > + > +/* Identical to struct nlattr. Need this to keep enum action_attrs in BTF */ > +struct xdp_action_nlattr { > + uint16_t nla_len; > + enum action_attrs action_type; > +}; > + > +struct xdp_flow_actions { > + struct xdp_flow_actions_header header; > + uint8_t data[XDP_MAX_ACTIONS_LEN - sizeof(struct > xdp_flow_actions_header)]; > + /* Dummy. Need xdp_action_nlattr to keep enum action_attrs in BTF */ > + struct xdp_action_nlattr _xdp_actions[]; > +}; > + > + > +/* Map definitions */ > + > +struct { > + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); > + __uint(max_entries, 256); > + __type(key, uint32_t); > + __type(value, long); > +} debug_stats SEC(".maps"); > + > +/* Temporary storage for packet miniflow. Need this because verifier does not > + * allow access to array variable in stack with variable index. 
Such access > + * happens in mask_key() */ > +struct { > + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); > + __uint(max_entries, 1); > + __type(key, uint32_t); > + __type(value, struct xdp_miniflow); > +} pkt_mf_tbl SEC(".maps"); > + > +struct { > + __uint(type, BPF_MAP_TYPE_XSKMAP); > + __uint(max_entries, 1); /* This should be redefined by userspace */ > + __uint(key_size, sizeof(int)); > + __uint(value_size, sizeof(int)); > +} xsks_map SEC(".maps"); > + > +struct { > + __uint(type, BPF_MAP_TYPE_DEVMAP); > + __uint(max_entries, XDP_MAX_PORTS); > + __uint(key_size, sizeof(int)); > + __uint(value_size, sizeof(int)); > +} output_map SEC(".maps"); > + > +/* Head index for subtbl_masks list */ > +/* TODO: Use global variable */ > +struct { > + __uint(type, BPF_MAP_TYPE_ARRAY); > + __uint(max_entries, 1); > + __type(key, uint32_t); > + __type(value, int); > +} subtbl_masks_hd SEC(".maps"); > + > +/* Information about subtable mask. A list implemented using array */ > +struct { > + __uint(type, BPF_MAP_TYPE_ARRAY); > + __uint(max_entries, XDP_MAX_SUBTABLES); > + __type(key, uint32_t); > + __type(value, struct xdp_subtable_mask); > +} subtbl_masks SEC(".maps"); > + > +/* Template for subtable hash-map. This will be used in userspace to create > + * flow_table array-of-maps. */ > +struct { > + __uint(type, BPF_MAP_TYPE_HASH); > + __uint(max_entries, XDP_MAX_SUBTABLE_FLOWS); > + __type(key, struct xdp_flow_key); > + __type(value, struct xdp_flow_actions); > +} subtbl_template SEC(".maps"); > + > +/* Array-of-maps whose entry contains subtable hash-map. 
*/ > +struct { > + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); > + __uint(max_entries, XDP_MAX_SUBTABLES); > + __uint(key_size, sizeof(uint32_t)); > + __uint(value_size, sizeof(uint32_t)); > +} flow_table SEC(".maps"); > + > + > +static inline void > +account_debug(int idx) > +{ > + long *cnt; > + > + cnt = bpf_map_lookup_elem(&debug_stats, &idx); > + if (cnt) { > + *cnt += 1; > + } > +} > + > +static inline void > +account_action(enum action_attrs act) > +{ > + account_debug(act + 1); > +} > + > +/* Derived from xsk_load_xdp_prog() in libbpf */ > +static inline int > +upcall(struct xdp_md *ctx) > +{ > + int ret, index = ctx->rx_queue_index; > + > + ret = bpf_redirect_map(&xsks_map, index, XDP_ABORTED); > + if (ret > 0) { > + return ret; > + } > + > + /* Fallback for kernel <= 5.3 not supporting default action in flags */ > + if (bpf_map_lookup_elem(&xsks_map, &index)) { > + return bpf_redirect_map(&xsks_map, index, 0); > + } > + > + return XDP_ABORTED; > +} > + > +static inline int > +action_output(int tx_port) > +{ > + account_action(XDP_ACTION_OUTPUT); > + > + return bpf_redirect_map(&output_map, tx_port, 0); > +} > + > +static inline int > +action_vlan_push(struct xdp_md *ctx OVS_UNUSED, > + const struct ovs_action_push_vlan *vlan OVS_UNUSED) > +{ > + account_action(XDP_ACTION_PUSH_VLAN); > + > + /* TODO: implement this */ > + return XDP_ABORTED; > +} > + > +static inline int > +action_vlan_pop(struct xdp_md *ctx OVS_UNUSED) > +{ > + account_action(XDP_ACTION_POP_VLAN); > + > + /* TODO: implement this */ > + return XDP_ABORTED; > +} > + > +/* TODO: Add more actions */ > + > + > +struct nw_params { > + union { > + ovs_be32 params; > + struct { > + uint8_t nw_frag; > + uint8_t nw_tos; > + uint8_t nw_ttl; > + uint8_t nw_proto; > + }; > + }; > +}; > + > +static inline int > +parse_ipv4(void *data, uint64_t *nh_off, void *data_end, > + struct bpf_mf_ctx *mf_ctx, struct nw_params *nw_params) > +{ > + struct ip_header *ip = data + *nh_off; > + > + if (ip + 1 > data_end) 
{ > + return 1; > + } > + > + /* Linux network drivers ensure that IP header is 4-byte aligned or > + * the platform can handle unaligned access */ > + miniflow_push_be32(mf_ctx, nw_src, *(ovs_be32 *)(void *)&ip->ip_src); > + miniflow_push_be32(mf_ctx, nw_dst, *(ovs_be32 *)(void *)&ip->ip_dst); > + > + if (OVS_UNLIKELY(IP_IS_FRAGMENT(ip->ip_frag_off))) { > + nw_params->nw_frag = FLOW_NW_FRAG_ANY; > + if (ip->ip_frag_off & htons(IP_FRAG_OFF_MASK)) { > + nw_params->nw_frag |= FLOW_NW_FRAG_LATER; > + } > + } else { > + nw_params->nw_frag = 0; > + } > + nw_params->nw_tos = ip->ip_tos; > + nw_params->nw_ttl = ip->ip_ttl; > + nw_params->nw_proto = ip->ip_proto; > + > + *nh_off += IP_IHL(ip->ip_ihl_ver) * 4; > + > + return 0; > +} > + > +static inline int > +xdp_miniflow_extract(struct xdp_md *ctx, struct xdp_miniflow *pkt_mf) > +{ > + void *data = (void *)(long)ctx->data; > + void *data_end = (void *)(long)ctx->data_end; > + struct eth_header *eth = data; > + struct vlan_header *vlan = NULL; > + ovs_be16 dl_type; > + uint64_t nh_off; > + struct nw_params nw_params; > + struct bpf_mf_ctx mf_ctx = { {{ 0 }}, (uint64_t *)&pkt_mf->value }; > + > + nh_off = sizeof *eth; > + if (data + nh_off > data_end) { > + return 1; > + } > + > + miniflow_push_macs(&mf_ctx, dl_dst, &eth->eth_dst); > + > + if (eth_type_vlan(eth->eth_type)) { > + vlan = data + nh_off; > + nh_off += sizeof(*vlan); > + if (data + nh_off > data_end) { > + return 1; > + } > + dl_type = vlan->vlan_next_type; > + } else { > + dl_type = eth->eth_type; > + } > + miniflow_push_be16(&mf_ctx, dl_type, dl_type); > + miniflow_pad_to_64(&mf_ctx, dl_type); > + > + if (vlan) { > + const ovs_16aligned_be32 *qp; > + union flow_vlan_hdr vlan_hdr; > + > + qp = (ovs_16aligned_be32 *)&eth->eth_type; > + vlan_hdr.qtag = get_16aligned_be32(qp); > + vlan_hdr.tci |= htons(VLAN_CFI); > + miniflow_push_be32(&mf_ctx, vlans, vlan_hdr.qtag); > + miniflow_push_be32_(&mf_ctx, > + offsetof(struct flow, vlans) + sizeof(ovs_be32), > + 0); > + } > + 
> + if (dl_type == htons(ETH_TYPE_IP)) { > + if (parse_ipv4(data, &nh_off, data_end, &mf_ctx, &nw_params)) { > + return 1; > + } > + } else { > + goto out; > + } > + miniflow_pad_from_64(&mf_ctx, nw_frag); > + miniflow_push_be32(&mf_ctx, nw_frag, &nw_params.params); > + > + if (nw_params.nw_proto == IPPROTO_TCP) { > + struct tcp_header *tcp = data + nh_off; > + > + if (tcp + 1 > data_end) { > + return 1; > + } > + > + miniflow_push_be16(&mf_ctx, tp_src, tcp->tcp_src); > + miniflow_push_be16(&mf_ctx, tp_dst, tcp->tcp_dst); > + } else if (nw_params.nw_proto == IPPROTO_UDP) { > + struct udp_header *udp = data + nh_off; > + > + if (udp + 1 > data_end) { > + return 1; > + } > + > + miniflow_push_be16(&mf_ctx, tp_src, udp->udp_src); > + miniflow_push_be16(&mf_ctx, tp_dst, udp->udp_dst); > + } > +out: > + pkt_mf->mf.map = mf_ctx.map; > + return 0; > +} > + > +#define for_each_subtable_mask(subtable_mask, head, idx, cnt) \ > + for (subtable_mask = bpf_map_lookup_elem(&subtbl_masks, (head)), \ > + idx = *(head), cnt = 0; \ > + subtable_mask != NULL && cnt < XDP_MAX_SUBTABLES; \ > + idx = subtable_mask->header.next, \ > + subtable_mask = bpf_map_lookup_elem(&subtbl_masks, &idx), cnt++) > + > +/* Returns false if an error happens */ > +static inline int > +mask_key(uint64_t *mkey, const struct miniflow *pkt_mf, > + const struct xdp_subtable_mask_header *tbl_mask) > +{ > + const struct miniflow *tbl_mf = &tbl_mask->mask.masks; > + const uint64_t *tbl_blocks = miniflow_get_values(tbl_mf); > + const uint64_t *pkt_blocks = miniflow_get_values(pkt_mf); > + uint64_t tbl_mf_bits = tbl_mf->map.bits[0]; > + uint64_t pkt_mf_bits = pkt_mf->map.bits[0]; > + uint8_t tbl_mf_bits_u0 = tbl_mask->mf_bits_u0; > + uint8_t tbl_mf_bits_u1 = tbl_mask->mf_bits_u1; > + unsigned int pkt_ofs = 0; > + int i = 0; > + > + /* This sensitive loop easily exceeds verifier limit 1M insns so > + * need to be careful when modifying. > + * E.g. 
increasing XDP_FLOW_U64S by adding keys to struct xdp_flow > + * increases verifier cost and may be rejected due to 1M insns exceeds */ > + for (; i < tbl_mf_bits_u0 + tbl_mf_bits_u1 && i < XDP_FLOW_U64S; i++) { > + uint64_t mf_mask; > + uint64_t idx_bits; > + unsigned int pkt_idx; > + uint64_t lowest_bit; > + > + if (i == tbl_mf_bits_u0) { > + tbl_mf_bits = tbl_mf->map.bits[1]; > + pkt_mf_bits = pkt_mf->map.bits[1]; > + pkt_ofs = count_1bits(pkt_mf->map.bits[0]); > + } > + > + lowest_bit = tbl_mf_bits & -tbl_mf_bits; > + tbl_mf_bits &= ~lowest_bit; > + if (!(lowest_bit & pkt_mf_bits)) { > + mkey[i] = 0; > + continue; > + } > + mf_mask = lowest_bit - 1; > + idx_bits = mf_mask & pkt_mf_bits; > + pkt_idx = count_1bits(idx_bits) + pkt_ofs; > + if (pkt_idx >= XDP_FLOW_U64S) { > + /* xdp flow api provider (userspace) BUG */ > + return false; > + } > + > + mkey[i] = pkt_blocks[pkt_idx] & tbl_blocks[i]; > + } > + > + return true; > +} > + > +SEC("xdp") int > +flowtable_afxdp(struct xdp_md *ctx) > +{ > + struct xdp_miniflow *pkt_mf; > + struct xdp_subtable_mask *subtable_mask; > + int *head; > + struct xdp_flow_actions *xdp_actions = NULL; > + struct nlattr *a; > + unsigned int left; > + int cnt, idx, zero = 0; > + > + account_debug(0); > + > + head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero); > + if (!head) { > + return XDP_ABORTED; > + } > + if (*head == XDP_SUBTABLES_TAIL) { > + /* Offload not enabled */ > + goto upcall; > + } > + > + /* Get temporary storage for storing packet miniflow */ > + pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero); > + if (!pkt_mf) { > + return XDP_ABORTED; > + } > + > + /* Extract miniflow from packet */ > + if (xdp_miniflow_extract(ctx, pkt_mf)) { > + return XDP_DROP; > + } > + > + /* Lookup each subtable */ > + for_each_subtable_mask(subtable_mask, head, idx, cnt) { > + struct xdp_flow_key mkey = { 0 }; > + void *subtable; > + > + subtable = bpf_map_lookup_elem(&flow_table, &idx); > + if (!subtable) { > + return XDP_ABORTED; > + } > + > 
+ if (!mask_key(mkey.miniflow_buf, &pkt_mf->mf, > + &subtable_mask->header)) { > + continue; > + } > + > + xdp_actions = bpf_map_lookup_elem(subtable, &mkey); > + if (xdp_actions) { > + break; > + } > + } > + > + if (!xdp_actions) { > + /* Flow entry miss */ > +upcall: > + return upcall(ctx); > + } > + > + /* Execute actions */ > + NL_ATTR_FOR_EACH_UNSAFE(a, left, xdp_flow_actions(&xdp_actions->header), > + xdp_actions->header.actions_len) { > + uint16_t type = bpf_nl_attr_type(a); > + int act; > + > + switch ((enum action_attrs)type) { > + case XDP_ACTION_OUTPUT: > + /* Note: userspace ensures there is no multiple output in > actions */ > + return action_output(*(int *)bpf_nl_attr_get(a)); > + case XDP_ACTION_PUSH_VLAN: > + act = action_vlan_push(ctx, bpf_nl_attr_get(a)); > + break; > + case XDP_ACTION_POP_VLAN: > + act = action_vlan_pop(ctx); > + break; > + default: > + return XDP_ABORTED; > + } > + if (act != _XDP_ACTION_CONTINUE) { > + return act; > + } > + } > + > + account_debug(1); > + return XDP_DROP; > +} > + > +char _license[] SEC("license") = "Apache-2.0"; > diff --git a/configure.ac b/configure.ac > index 1877aae56..99a93ce00 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -99,6 +99,7 @@ OVS_CHECK_DOT > OVS_CHECK_IF_DL > OVS_CHECK_STRTOK_R > OVS_CHECK_LINUX_AF_XDP > +OVS_CHECK_LINUX_BPF > AC_CHECK_DECLS([sys_siglist], [], [], [[#include <signal.h>]]) > AC_CHECK_MEMBERS([struct stat.st_mtim.tv_nsec, struct stat.st_mtimensec], > [], [], [[#include <sys/stat.h>]]) > @@ -198,6 +199,7 @@ AC_CONFIG_FILES(datapath/Makefile) > AC_CONFIG_FILES(datapath/linux/Kbuild) > AC_CONFIG_FILES(datapath/linux/Makefile) > AC_CONFIG_FILES(datapath/linux/Makefile.main) > +AC_CONFIG_FILES(bpf/Makefile) > AC_CONFIG_FILES(tests/atlocal) > AC_CONFIG_FILES(lib/libopenvswitch.pc) > AC_CONFIG_FILES(lib/libsflow.pc) > -- > 2.25.1 > > _______________________________________________ > dev mailing list > [email protected] > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
-- Best regards, Tonghao _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
