I folded in the following incremental locally:
--- a/lib/ipf.c
+++ b/lib/ipf.c
@@ -533,8 +533,6 @@ ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
case IPF_LIST_STATE_LAST_SEEN:
if (ff) {
next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
- } else if (lf) {
- next_state = IPF_LIST_STATE_LAST_SEEN;
} else {
next_state = IPF_LIST_STATE_LAST_SEEN;
}
@@ -768,7 +766,7 @@ ipf_list_key_eq(const struct ipf_list_key *key1,
static struct ipf_list *
ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
uint32_t hash)
- /* OVS_REQUIRES(ipf->ipf_lock) */
+ OVS_REQUIRES(ipf->ipf_lock)
{
struct ipf_list *ipf_list;
HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
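For context on the first hunk: in the LAST_SEEN case, the 'else if (lf)' arm
and the final 'else' both produced IPF_LIST_STATE_LAST_SEEN, so the arm was
dead and the case collapses to:

    case IPF_LIST_STATE_LAST_SEEN:
        if (ff) {
            next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_LAST_SEEN;
        }
        break;

The second hunk enables the previously commented-out OVS_REQUIRES annotation
so Clang's thread-safety analysis can check that 'ipf_lock' is held.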
On Wed, Feb 13, 2019 at 3:34 PM Darrell Ball <[email protected]> wrote:
> Fragmentation handling is added for supporting conntrack.
> Both v4 and v6 are supported.
>
> After discussion with several people, I decided not to store
> configuration state in the database, for consistency with the kernel
> going forward, for similarity with other conntrack configuration that
> will likewise not be in the database, and for overall simplicity.
> Accordingly, fragmentation handling is enabled by default.
>
> This patch enables fragmentation tests for the userspace datapath.
>
> Signed-off-by: Darrell Ball <[email protected]>
> ---
> Documentation/faq/releases.rst | 51 +-
> NEWS | 10 +-
> include/sparse/netinet/ip6.h | 1 +
> lib/automake.mk | 4 +-
> lib/conntrack.c | 22 +-
> lib/conntrack.h | 4 +
> lib/ct-dpif.c | 58 +-
> lib/ct-dpif.h | 12 +-
> lib/dpctl.c | 215 +++++-
> lib/dpctl.man | 36 +
> lib/dpif-netdev.c | 65 +-
> lib/dpif-netlink.c | 9 +-
> lib/dpif-provider.h | 53 +-
>  lib/ipf.c                        | 1528 ++++++++++++++++++++++++++++++++++++++
> lib/ipf.h | 63 ++
> tests/system-kmod-macros.at | 46 +-
> tests/system-traffic.at | 51 +-
> tests/system-userspace-macros.at | 186 ++++-
> 18 files changed, 2332 insertions(+), 82 deletions(-)
> create mode 100644 lib/ipf.c
> create mode 100644 lib/ipf.h
>
> diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst
> index cd7254b..a78152b 100644
> --- a/Documentation/faq/releases.rst
> +++ b/Documentation/faq/releases.rst
> @@ -105,31 +105,32 @@ Q: Are all features available with all datapaths?
> The following table lists the datapath supported features from an Open
> vSwitch user's perspective.
>
> - ===================== ============== ============== ========= =======
> - Feature Linux upstream Linux OVS tree Userspace Hyper-V
> - ===================== ============== ============== ========= =======
> - NAT 4.6 YES Yes NO
> - Connection tracking 4.3 YES PARTIAL PARTIAL
> - Tunnel - LISP NO YES NO NO
> - Tunnel - STT NO YES NO YES
> - Tunnel - GRE 3.11 YES YES YES
> - Tunnel - VXLAN 3.12 YES YES YES
> - Tunnel - Geneve 3.18 YES YES YES
> - Tunnel - GRE-IPv6 4.18 YES YES NO
> - Tunnel - VXLAN-IPv6 4.3 YES YES NO
> - Tunnel - Geneve-IPv6 4.4 YES YES NO
> - Tunnel - ERSPAN 4.18 YES YES NO
> - Tunnel - ERSPAN-IPv6 4.18 YES YES NO
> - QoS - Policing YES YES YES NO
> - QoS - Shaping YES YES NO NO
> - sFlow YES YES YES NO
> - IPFIX 3.10 YES YES NO
> - Set action YES YES YES PARTIAL
> - NIC Bonding YES YES YES YES
> - Multiple VTEPs YES YES YES YES
> - Meters 4.15 YES YES NO
> - Conntrack zone limit 4.18 YES NO NO
> - ===================== ============== ============== ========= =======
> +    ========================== ============== ============== ========= =======
> +    Feature                    Linux upstream Linux OVS tree Userspace Hyper-V
> +    ========================== ============== ============== ========= =======
> +    Connection tracking        4.3            YES            YES       YES
> +    Conntrack Fragment Reass.  4.3            YES            YES       YES
> +    NAT                        4.6            YES            YES       NO
> +    Conntrack zone limit       4.18           YES            NO        NO
> +    Tunnel - LISP              NO             YES            NO        NO
> +    Tunnel - STT               NO             YES            NO        YES
> +    Tunnel - GRE               3.11           YES            YES       YES
> +    Tunnel - VXLAN             3.12           YES            YES       YES
> +    Tunnel - Geneve            3.18           YES            YES       YES
> +    Tunnel - GRE-IPv6          NO             NO             YES       NO
> +    Tunnel - VXLAN-IPv6        4.3            YES            YES       NO
> +    Tunnel - Geneve-IPv6       4.4            YES            YES       NO
> +    Tunnel - ERSPAN            4.18           YES            YES       NO
> +    Tunnel - ERSPAN-IPv6       4.18           YES            YES       NO
> +    QoS - Policing             YES            YES            YES       NO
> +    QoS - Shaping              YES            YES            NO        NO
> +    sFlow                      YES            YES            YES       NO
> +    IPFIX                      3.10           YES            YES       NO
> +    Set action                 YES            YES            YES       PARTIAL
> +    NIC Bonding                YES            YES            YES       YES
> +    Multiple VTEPs             YES            YES            YES       YES
> +    Meters                     4.15           YES            YES       NO
> +    ========================== ============== ============== ========= =======
>
> Do note, however:
>
> diff --git a/NEWS b/NEWS
> index ccc0bfb..2dabb97 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -8,7 +8,15 @@ Post-v2.11.0
> - Userspace datapath:
> * ICMPv6 ND enhancements: support for match and set ND options type
> and reserved fields.
> -
> + * Add v4/v6 fragmentation support for conntrack.
> + * New ovs-appctl "dpctl/ipf-set-enabled" and "dpctl/ipf-set-disabled"
> + commands for userspace datapath conntrack fragmentation support.
> + * New "ovs-appctl dpctl/ipf-set-min-frag" command for userspace
> + datapath conntrack fragmentation support.
> +     * New "ovs-appctl dpctl/ipf-set-max-nfrags" command for userspace
> +       datapath conntrack fragmentation support.
> +     * New "ovs-appctl dpctl/ipf-get-status" command for userspace
> +       datapath conntrack fragmentation support.
>
> v2.11.0 - xx xxx xxxx
> ---------------------
> diff --git a/include/sparse/netinet/ip6.h b/include/sparse/netinet/ip6.h
> index d2a54de..bfa637a 100644
> --- a/include/sparse/netinet/ip6.h
> +++ b/include/sparse/netinet/ip6.h
> @@ -64,5 +64,6 @@ struct ip6_frag {
> };
>
> #define IP6F_OFF_MASK ((OVS_FORCE ovs_be16) 0xfff8)
> +#define IP6F_MORE_FRAG ((OVS_FORCE ovs_be16) 0x0001)
>
> #endif /* netinet/ip6.h sparse */
> diff --git a/lib/automake.mk b/lib/automake.mk
> index ba10410..bae032b 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -1,4 +1,4 @@
> -# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
> +# Copyright (C) 2009-2018 Nicira, Inc.
> #
> # Copying and distribution of this file, with or without modification,
> # are permitted in any medium without royalty provided the copyright
> @@ -108,6 +108,8 @@ lib_libopenvswitch_la_SOURCES = \
> lib/hmapx.h \
> lib/id-pool.c \
> lib/id-pool.h \
> + lib/ipf.c \
> + lib/ipf.h \
> lib/jhash.c \
> lib/jhash.h \
> lib/json.c \
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index a044a69..f04cf0f 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
> + * Copyright (c) 2015-2019 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -30,6 +30,7 @@
> #include "ct-dpif.h"
> #include "dp-packet.h"
> #include "flow.h"
> +#include "ipf.h"
> #include "netdev.h"
> #include "odp-netlink.h"
> #include "openvswitch/hmap.h"
> @@ -340,6 +341,7 @@ conntrack_init(struct conntrack *ct)
> atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
> latch_init(&ct->clean_thread_exit);
>      ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
> + ct->ipf = ipf_init();
> }
>
> /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
> @@ -382,6 +384,7 @@ conntrack_destroy(struct conntrack *ct)
> hindex_destroy(&ct->alg_expectation_refs);
> ct_rwlock_unlock(&ct->resources_lock);
> ct_rwlock_destroy(&ct->resources_lock);
> + ipf_destroy(ct->ipf);
> }
>
> static unsigned hash_to_bucket(uint32_t hash)
> @@ -1299,7 +1302,8 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
>
> /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
>  * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
> - * the l3 and and l4 offset properly set.
> + * the l3 and l4 offsets properly set.  Performs fragment reassembly with
> + * the help of ipf_preprocess_conntrack().
> *
>  * If 'commit' is true, the packets are allowed to create new entries in the
> * connection tables. 'setmark', if not NULL, should point to a two
> @@ -1314,11 +1318,15 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
> const struct nat_action_info_t *nat_action_info,
> long long now)
> {
> + ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
> + ct->hash_basis);
> +
> struct dp_packet *packet;
> struct conn_lookup_ctx ctx;
>
> DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
> - if (!conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
> + if (packet->md.ct_state == CS_INVALID
> + || !conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
> packet->md.ct_state = CS_INVALID;
> write_ct_md(packet, zone, NULL, NULL, NULL);
> continue;
> @@ -1327,6 +1335,8 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
> setlabel, nat_action_info, tp_src, tp_dst, helper);
> }
>
> + ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
> +
> return 0;
> }
>
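To make the resulting control flow explicit, a minimal sketch of what
conntrack_execute() does after this change, using only the names above:

    ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
                             ct->hash_basis);   /* gather/reassemble frags */
    DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
        /* Packets already marked CS_INVALID (e.g. bad or duplicate
         * fragments) skip conn_key_extract() and get invalid ct metadata;
         * all others go through process_one() as before. */
    }
    ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);

So fragment handling brackets the existing per-packet loop rather than
changing it.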
> @@ -2484,6 +2494,12 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
> }
> }
>
> +void *
> +conntrack_ipf_ctx(struct conntrack *ct)
> +{
> + return ct->ipf;
> +}
> +
> int
> conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
> const uint16_t *pzone, int *ptot_bkts)
> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index e3a5dcc..0c0915e 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -122,6 +122,7 @@ int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *,
> int conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns);
> int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns);
> int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns);
> +void *conntrack_ipf_ctx(struct conntrack *ct);
>
> /* 'struct ct_lock' is a wrapper for an adaptive mutex.  It's useful to try
> * different types of locks (e.g. spinlocks) */
> @@ -293,6 +294,9 @@ struct conntrack {
> */
> struct ct_rwlock resources_lock;
>
> + /* Fragmentation handling context. */
> + void *ipf;
> +
> };
>
> #endif /* conntrack.h */
> diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
> index 67eccd0..b2c9b43 100644
> --- a/lib/ct-dpif.c
> +++ b/lib/ct-dpif.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2015 Nicira, Inc.
> + * Copyright (c) 2015, 2018 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -194,6 +194,62 @@ ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *zone_limits)
> : EOPNOTSUPP);
> }
>
> +int
> +ct_dpif_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
> +{
> + return (dpif->dpif_class->ipf_set_enabled
> + ? dpif->dpif_class->ipf_set_enabled(dpif, v6, enable)
> + : EOPNOTSUPP);
> +}
> +
> +int
> +ct_dpif_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
> +{
> + return (dpif->dpif_class->ipf_set_min_frag
> + ? dpif->dpif_class->ipf_set_min_frag(dpif, v6, min_frag)
> + : EOPNOTSUPP);
> +}
> +
> +int
> +ct_dpif_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
> +{
> + return (dpif->dpif_class->ipf_set_max_nfrags
> + ? dpif->dpif_class->ipf_set_max_nfrags(dpif, max_frags)
> + : EOPNOTSUPP);
> +}
> +
> +int
> +ct_dpif_ipf_get_status(struct dpif *dpif,
> +                       struct dpif_ipf_status *dpif_ipf_status)
> +{
> + return (dpif->dpif_class->ipf_get_status
> + ? dpif->dpif_class->ipf_get_status(dpif, dpif_ipf_status)
> + : EOPNOTSUPP);
> +}
> +
> +int
> +ct_dpif_ipf_dump_start(struct dpif *dpif, struct ipf_dump_ctx **dump_ctx)
> +{
> + return (dpif->dpif_class->ipf_dump_start
> + ? dpif->dpif_class->ipf_dump_start(dpif, dump_ctx)
> + : EOPNOTSUPP);
> +}
> +
> +int
> +ct_dpif_ipf_dump_next(struct dpif *dpif, void *dump_ctx, char **dump)
> +{
> + return (dpif->dpif_class->ipf_dump_next
> + ? dpif->dpif_class->ipf_dump_next(dpif, dump_ctx, dump)
> + : EOPNOTSUPP);
> +}
> +
> +int
> +ct_dpif_ipf_dump_done(struct dpif *dpif, void *dump_ctx)
> +{
> + return (dpif->dpif_class->ipf_dump_done
> + ? dpif->dpif_class->ipf_dump_done(dpif, dump_ctx)
> + : EOPNOTSUPP);
> +}
> +
> void
> ct_dpif_entry_uninit(struct ct_dpif_entry *entry)
> {
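All of these wrappers share one dispatch shape: if the datapath's dpif_class
does not implement the hook (dpif-netlink leaves them NULL later in this
patch), the caller gets EOPNOTSUPP. A minimal caller sketch, using only
functions introduced here:

    struct dpif_ipf_status status;
    int err = ct_dpif_ipf_get_status(dpif, &status);
    if (err == EOPNOTSUPP) {
        /* This datapath has no userspace ipf support. */
    } else if (!err) {
        /* status.v4.enabled, status.nfrag, etc. are valid here. */
    }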
> diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h
> index decc14f..0151cfe 100644
> --- a/lib/ct-dpif.h
> +++ b/lib/ct-dpif.h
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2015 Nicira, Inc.
> + * Copyright (c) 2015, 2018 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -186,6 +186,8 @@ enum {
> };
>
> struct dpif;
> +struct dpif_ipf_status;
> +struct ipf_dump_ctx;
>
> struct ct_dpif_dump_state {
> struct dpif *dpif;
> @@ -212,6 +214,14 @@ int ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit,
> int ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit,
> const struct ovs_list *, struct ovs_list *);
> int ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *);
> +int ct_dpif_ipf_set_enabled(struct dpif *, bool v6, bool enable);
> +int ct_dpif_ipf_set_min_frag(struct dpif *, bool v6, uint32_t min_frag);
> +int ct_dpif_ipf_set_max_nfrags(struct dpif *, uint32_t max_frags);
> +int ct_dpif_ipf_get_status(struct dpif *dpif,
> + struct dpif_ipf_status *dpif_ipf_status);
> +int ct_dpif_ipf_dump_start(struct dpif *dpif, struct ipf_dump_ctx **);
> +int ct_dpif_ipf_dump_next(struct dpif *dpif, void *, char **);
> +int ct_dpif_ipf_dump_done(struct dpif *dpif, void *);
> void ct_dpif_entry_uninit(struct ct_dpif_entry *);
> void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *,
> bool verbose, bool print_stats);
> diff --git a/lib/dpctl.c b/lib/dpctl.c
> index 59071cd..f5a09b7 100644
> --- a/lib/dpctl.c
> +++ b/lib/dpctl.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2008-2017 Nicira, Inc.
> + * Copyright (c) 2008-2018 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -33,6 +33,7 @@
> #include "dirs.h"
> #include "dpctl.h"
> #include "dpif.h"
> +#include "dpif-provider.h"
> #include "openvswitch/dynamic-string.h"
> #include "flow.h"
> #include "openvswitch/match.h"
> @@ -1917,6 +1918,210 @@ out:
> return error;
> }
>
> +static int
> +ipf_set_enabled__(int argc, const char *argv[], struct dpctl_params *dpctl_p,
> + bool enabled)
> +{
> + struct dpif *dpif;
> + int error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif);
> + if (!error) {
> + char v4_or_v6[3] = {0};
> + if (ovs_scan(argv[argc - 1], "%2s", v4_or_v6) &&
> +            (!strncmp(v4_or_v6, "v4", 2) || !strncmp(v4_or_v6, "v6", 2))) {
> + error = ct_dpif_ipf_set_enabled(
> + dpif, !strncmp(v4_or_v6, "v6", 2), enabled);
> + if (!error) {
> + dpctl_print(dpctl_p,
> + "%s fragmentation reassembly successful",
> + enabled ? "enabling" : "disabling");
> + } else {
> + dpctl_error(dpctl_p, error,
> + "%s fragmentation reassembly failed",
> + enabled ? "enabling" : "disabling");
> + }
> + } else {
> + error = EINVAL;
> + dpctl_error(dpctl_p, error,
> +                        "parameter missing: 'v4' for IPv4 or 'v6' for IPv6");
> + }
> + dpif_close(dpif);
> + }
> + return error;
> +}
> +
> +static int
> +dpctl_ipf_set_enabled(int argc, const char *argv[],
> + struct dpctl_params *dpctl_p)
> +{
> + return ipf_set_enabled__(argc, argv, dpctl_p, true);
> +}
> +
> +static int
> +dpctl_ipf_set_disabled(int argc, const char *argv[],
> + struct dpctl_params *dpctl_p)
> +{
> + return ipf_set_enabled__(argc, argv, dpctl_p, false);
> +}
> +
> +static int
> +dpctl_ipf_set_min_frag(int argc, const char *argv[],
> + struct dpctl_params *dpctl_p)
> +{
> + struct dpif *dpif;
> + int error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif);
> + if (!error) {
> + char v4_or_v6[3] = {0};
> + if (ovs_scan(argv[argc - 2], "%2s", v4_or_v6) &&
> +            (!strncmp(v4_or_v6, "v4", 2) || !strncmp(v4_or_v6, "v6", 2))) {
> + uint32_t min_fragment;
> + if (ovs_scan(argv[argc - 1], "%"SCNu32, &min_fragment)) {
> + error = ct_dpif_ipf_set_min_frag(
> +                    dpif, !strncmp(v4_or_v6, "v6", 2), min_fragment);
> + if (!error) {
> + dpctl_print(dpctl_p,
> +                                "setting minimum fragment size successful");
> + } else {
> + dpctl_error(dpctl_p, error,
> +                                "requested minimum fragment size too small;"
> + " see documentation");
> + }
> + } else {
> + error = EINVAL;
> + dpctl_error(dpctl_p, error,
> +                            "parameter missing for minimum fragment size");
> + }
> + } else {
> + error = EINVAL;
> + dpctl_error(dpctl_p, error,
> +                    "parameter missing: 'v4' for IPv4 or 'v6' for IPv6");
> + }
> + dpif_close(dpif);
> + }
> +
> + return error;
> +}
> +
> +static int
> +dpctl_ipf_set_max_nfrags(int argc, const char *argv[],
> + struct dpctl_params *dpctl_p)
> +{
> + struct dpif *dpif;
> + int error = opt_dpif_open(argc, argv, dpctl_p, 3, &dpif);
> + if (!error) {
> + uint32_t nfrags_max;
> + if (ovs_scan(argv[argc - 1], "%"SCNu32, &nfrags_max)) {
> + error = ct_dpif_ipf_set_max_nfrags(dpif, nfrags_max);
> + if (!error) {
> + dpctl_print(dpctl_p,
> + "setting maximum fragments successful");
> + } else {
> + dpctl_error(dpctl_p, error,
> + "setting maximum fragments failed");
> + }
> + } else {
> + error = EINVAL;
> + dpctl_error(dpctl_p, error,
> + "parameter missing for maximum fragments");
> + }
> + dpif_close(dpif);
> + }
> +
> + return error;
> +}
> +
> +static void
> +dpctl_dump_ipf(struct dpif *dpif, struct dpctl_params *dpctl_p)
> +{
> + struct ipf_dump_ctx *dump_ctx;
> + char *dump;
> +
> + int error = ct_dpif_ipf_dump_start(dpif, &dump_ctx);
> + if (error) {
> + dpctl_error(dpctl_p, error, "starting ipf list dump");
> + /* Nothing to clean up, just return. */
> + return;
> + }
> +
> + dpctl_print(dpctl_p, "\n Fragment Lists:\n\n");
> + while (!(error = ct_dpif_ipf_dump_next(dpif, dump_ctx, &dump))) {
> + dpctl_print(dpctl_p, "%s\n", dump);
> + free(dump);
> + }
> +
> + if (error && error != EOF) {
> + dpctl_error(dpctl_p, error, "dumping ipf lists failed");
> + }
> +
> + ct_dpif_ipf_dump_done(dpif, dump_ctx);
> +}
> +
> +static int
> +dpctl_ct_ipf_get_status(int argc, const char *argv[],
> + struct dpctl_params *dpctl_p)
> +{
> + struct dpif *dpif;
> + int error = opt_dpif_open(argc, argv, dpctl_p, 2, &dpif);
> +
> + if (!error) {
> + struct dpif_ipf_status dpif_ipf_status;
> + error = ct_dpif_ipf_get_status(dpif, &dpif_ipf_status);
> +
> + if (!error) {
> + dpctl_print(dpctl_p, " Fragmentation Module Status\n");
> + dpctl_print(dpctl_p, " ---------------------------\n");
> + dpctl_print(dpctl_p, " v4 enabled: %u\n",
> + dpif_ipf_status.v4.enabled);
> + dpctl_print(dpctl_p, " v6 enabled: %u\n",
> + dpif_ipf_status.v6.enabled);
> + dpctl_print(dpctl_p, " max num frags (v4/v6): %u\n",
> + dpif_ipf_status.nfrag_max);
> + dpctl_print(dpctl_p, " num frag: %u\n",
> + dpif_ipf_status.nfrag);
> + dpctl_print(dpctl_p, " min v4 frag size: %u\n",
> + dpif_ipf_status.v4.min_frag_size);
> + dpctl_print(dpctl_p, " v4 frags accepted: %"PRIu64"\n",
> + dpif_ipf_status.v4.nfrag_accepted);
> +            dpctl_print(dpctl_p, "        v4 frags completed: %"PRIu64"\n",
> + dpif_ipf_status.v4.nfrag_completed_sent);
> + dpctl_print(dpctl_p, " v4 frags expired: %"PRIu64"\n",
> + dpif_ipf_status.v4.nfrag_expired_sent);
> +            dpctl_print(dpctl_p, "        v4 frags too small: %"PRIu64"\n",
> + dpif_ipf_status.v4.nfrag_too_small);
> +            dpctl_print(dpctl_p, "       v4 frags overlapped: %"PRIu64"\n",
> + dpif_ipf_status.v4.nfrag_overlap);
> + dpctl_print(dpctl_p, " v4 frags purged: %"PRIu64"\n",
> + dpif_ipf_status.v4.nfrag_purged);
> +
> + dpctl_print(dpctl_p, " min v6 frag size: %u\n",
> + dpif_ipf_status.v6.min_frag_size);
> + dpctl_print(dpctl_p, " v6 frags accepted: %"PRIu64"\n",
> + dpif_ipf_status.v6.nfrag_accepted);
> +            dpctl_print(dpctl_p, "        v6 frags completed: %"PRIu64"\n",
> + dpif_ipf_status.v6.nfrag_completed_sent);
> + dpctl_print(dpctl_p, " v6 frags expired: %"PRIu64"\n",
> + dpif_ipf_status.v6.nfrag_expired_sent);
> +            dpctl_print(dpctl_p, "        v6 frags too small: %"PRIu64"\n",
> + dpif_ipf_status.v6.nfrag_too_small);
> +            dpctl_print(dpctl_p, "       v6 frags overlapped: %"PRIu64"\n",
> + dpif_ipf_status.v6.nfrag_overlap);
> + dpctl_print(dpctl_p, " v6 frags purged: %"PRIu64"\n",
> + dpif_ipf_status.v6.nfrag_purged);
> + } else {
> + dpctl_error(dpctl_p, error,
> + "ipf status could not be retrieved");
> + return error;
> + }
> +
> + if (dpctl_p->verbosity) {
> + dpctl_dump_ipf(dpif, dpctl_p);
> + }
> +
> + dpif_close(dpif);
> + }
> +
> + return error;
> +}
> +
> /* Undocumented commands for unit testing. */
>
> static int
> @@ -2222,6 +2427,14 @@ static const struct dpctl_command all_commands[] = {
> DP_RO },
> { "ct-get-limits", "[dp] [zone=N1[,N2]...]", 0, 2,
> dpctl_ct_get_limits,
> DP_RO },
> +    { "ipf-set-enabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_enabled, DP_RW },
> +    { "ipf-set-disabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_disabled, DP_RW },
> + { "ipf-set-min-frag", "[dp] v4|v6 minfragment", 2, 3,
> + dpctl_ipf_set_min_frag, DP_RW },
> + { "ipf-set-max-nfrags", "[dp] maxfrags", 1, 2,
> + dpctl_ipf_set_max_nfrags, DP_RW },
> + { "ipf-get-status", "[dp]", 0, 1, dpctl_ct_ipf_get_status,
> + DP_RO },
> { "help", "", 0, INT_MAX, dpctl_help, DP_RO },
> { "list-commands", "", 0, INT_MAX, dpctl_list_commands, DP_RO },
>
> diff --git a/lib/dpctl.man b/lib/dpctl.man
> index fe0aec9..f22029f 100644
> --- a/lib/dpctl.man
> +++ b/lib/dpctl.man
> @@ -220,6 +220,42 @@ nftables and the regular host stack).  Therefore, the following commands
> do not apply specifically to one datapath.
> .
> .TP
> +\*(DX\fBipf\-set\-enabled\fR [\fIdp\fR] \fBv4\fR|\fBv6\fR
> +.TQ
> +\*(DX\fBipf\-set\-disabled\fR [\fIdp\fR] \fBv4\fR|\fBv6\fR
> +Enables or disables IP fragmentation handling for the userspace
> +connection tracker. Either \fBv4\fR or \fBv6\fR must be specified.
> +Both IPv4 and IPv6 fragment reassembly are enabled by default. Only
> +supported for the userspace datapath.
> +.
> +.TP
> +\*(DX\fBipf\-set\-min\-frag\fR [\fIdp\fR] \fBv4\fR|\fBv6\fR \fIminfrag\fR
> +Sets the minimum fragment size for non-final fragments to
> +\fIminfrag\fR.  Either \fBv4\fR or \fBv6\fR must be specified.  For
> +enhanced DoS protection, a higher minimum fragment size can usually be
> +used.  The default IPv4 value is 1200 and the clamped minimum is 400.
> +The default IPv6 value is 1280, with a clamped minimum of 400, for
> +testing flexibility.  The maximum fragment size is not clamped; however,
> +setting this value too high might result in valid fragments being
> +dropped.  Only supported for the userspace datapath.
> +.
> +.TP
> +\*(DX\fBipf\-set\-max\-nfrags\fR [\fIdp\fR] \fImaxfrags\fR
> +Sets the maximum number of fragments tracked by the userspace datapath
> +connection tracker to \fImaxfrags\fR. The default value is 1000 and the
> +clamped maximum is 5000. Note that packet buffers can be held by the
> +fragmentation module while fragments are incomplete, but will time out
> +after 15 seconds.  Memory pool sizing should be set accordingly when
> +fragmentation is enabled.  Only supported for the userspace datapath.
> +.
> +.TP
> +.DO "[\fB\-m\fR | \fB\-\-more\fR]" "\*(DX\fBipf\-get\-status\fR [\fIdp\fR]"
> +Gets the configuration settings and fragment counters associated with the
> +fragmentation handling of the userspace datapath connection tracker.
> +With \fB\-m\fR or \fB\-\-more\fR, also dumps the IP fragment lists.
> +Only supported for the userspace datapath.
> +.
> +.TP
> .DO "[\fB\-m\fR | \fB\-\-more\fR] [\fB\-s\fR | \fB\-\-statistics\fR]" "\*(DX\fBdump\-conntrack\fR" "[\fIdp\fR] [\fBzone=\fIzone\fR]"
> Prints to the console all the connection entries in the tracker used by
> \fIdp\fR. If \fBzone=\fIzone\fR is specified, only shows the connections
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index f41f1d7..77ac1d2 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
> + * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -47,6 +47,7 @@
> #include "flow.h"
> #include "hmapx.h"
> #include "id-pool.h"
> +#include "ipf.h"
> #include "latch.h"
> #include "netdev.h"
> #include "netdev-provider.h"
> @@ -7356,6 +7357,61 @@ dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
> return conntrack_get_nconns(&dp->conntrack, nconns);
> }
>
> +static int
> +dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
> +{
> + struct dp_netdev *dp = get_dp_netdev(dpif);
> + return ipf_set_enabled(conntrack_ipf_ctx(&dp->conntrack), v6, enable);
> +}
> +
> +static int
> +dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
> +{
> + struct dp_netdev *dp = get_dp_netdev(dpif);
> +    return ipf_set_min_frag(conntrack_ipf_ctx(&dp->conntrack), v6, min_frag);
> +}
> +
> +static int
> +dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
> +{
> + struct dp_netdev *dp = get_dp_netdev(dpif);
> +    return ipf_set_max_nfrags(conntrack_ipf_ctx(&dp->conntrack), max_frags);
> +}
> +
> +/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
> + * diverge. */
> +static int
> +dpif_netdev_ipf_get_status(struct dpif *dpif,
> + struct dpif_ipf_status *dpif_ipf_status)
> +{
> + struct dp_netdev *dp = get_dp_netdev(dpif);
> + ipf_get_status(conntrack_ipf_ctx(&dp->conntrack),
> + (struct ipf_status *) dpif_ipf_status);
> + return 0;
> +}
> +
> +static int
> +dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
> + struct ipf_dump_ctx **ipf_dump_ctx)
> +{
> + return ipf_dump_start(ipf_dump_ctx);
> +}
> +
> +static int
> +dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
> +{
> + struct dp_netdev *dp = get_dp_netdev(dpif);
> + return ipf_dump_next(conntrack_ipf_ctx(&dp->conntrack), ipf_dump_ctx,
> + dump);
> +}
> +
> +static int
> +dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
> +{
> +    return ipf_dump_done(ipf_dump_ctx);
> +}
> +
> const struct dpif_class dpif_netdev_class = {
> "netdev",
> dpif_netdev_init,
> @@ -7407,6 +7463,13 @@ const struct dpif_class dpif_netdev_class = {
> NULL, /* ct_set_limits */
> NULL, /* ct_get_limits */
> NULL, /* ct_del_limits */
> + dpif_netdev_ipf_set_enabled,
> + dpif_netdev_ipf_set_min_frag,
> + dpif_netdev_ipf_set_max_nfrags,
> + dpif_netdev_ipf_get_status,
> + dpif_netdev_ipf_dump_start,
> + dpif_netdev_ipf_dump_next,
> + dpif_netdev_ipf_dump_done,
> dpif_netdev_meter_get_features,
> dpif_netdev_meter_set,
> dpif_netdev_meter_get,
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index e23a35d..73641a5 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2008-2017 Nicira, Inc.
> + * Copyright (c) 2008-2018 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -3429,6 +3429,13 @@ const struct dpif_class dpif_netlink_class = {
> dpif_netlink_ct_set_limits,
> dpif_netlink_ct_get_limits,
> dpif_netlink_ct_del_limits,
> + NULL, /* ipf_set_enabled */
> + NULL, /* ipf_set_min_frag */
> + NULL, /* ipf_set_max_nfrags */
> + NULL, /* ipf_get_status */
> + NULL, /* ipf_dump_start */
> + NULL, /* ipf_dump_next */
> + NULL, /* ipf_dump_done */
> dpif_netlink_meter_get_features,
> dpif_netlink_meter_set,
> dpif_netlink_meter_get,
> diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
> index 78e153c..b2a4dff 100644
> --- a/lib/dpif-provider.h
> +++ b/lib/dpif-provider.h
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
> + * Copyright (c) 2009-2014, 2018 Nicira, Inc.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -42,6 +42,9 @@ struct dpif {
> long long int current_ms;
> };
>
> +struct dpif_ipf_status;
> +struct ipf_dump_ctx;
> +
> void dpif_init(struct dpif *, const struct dpif_class *, const char *name,
> uint8_t netflow_engine_type, uint8_t netflow_engine_id);
> void dpif_uninit(struct dpif *dpif, bool close);
> @@ -78,6 +81,27 @@ struct ct_dpif_dump_state;
> struct ct_dpif_entry;
> struct ct_dpif_tuple;
>
> +/* 'dpif_ipf_proto_status' and 'dpif_ipf_status' are presently in
> + * sync with 'ipf_proto_status' and 'ipf_status', but more
> + * generally represent a superset of present and future support. */
> +struct dpif_ipf_proto_status {
> + uint64_t nfrag_accepted;
> + uint64_t nfrag_completed_sent;
> + uint64_t nfrag_expired_sent;
> + uint64_t nfrag_too_small;
> + uint64_t nfrag_overlap;
> + uint64_t nfrag_purged;
> + unsigned int min_frag_size;
> + bool enabled;
> +};
> +
> +struct dpif_ipf_status {
> + struct dpif_ipf_proto_status v4;
> + struct dpif_ipf_proto_status v6;
> + unsigned int nfrag;
> + unsigned int nfrag_max;
> +};
> +
> /* Datapath interface class structure, to be defined by each implementation of
> * a datapath interface.
> *
> @@ -468,6 +492,33 @@ struct dpif_class {
> * list of 'struct ct_dpif_zone_limit' entries. */
>     int (*ct_del_limits)(struct dpif *, const struct ovs_list *zone_limits);
>
> + /* IP Fragmentation. */
> +
> + /* Disables or enables conntrack fragment reassembly. The default
> + * setting is enabled. */
> + int (*ipf_set_enabled)(struct dpif *, bool v6, bool enabled);
> +
> +    /* Sets the minimum fragment size allowed. */
> + int (*ipf_set_min_frag)(struct dpif *, bool v6, uint32_t min_frag);
> +
> +    /* Sets the maximum number of fragments tracked. */
> + int (*ipf_set_max_nfrags)(struct dpif *, uint32_t max_nfrags);
> +
> + /* Get fragmentation configuration status and counters. */
> + int (*ipf_get_status)(struct dpif *,
> + struct dpif_ipf_status *dpif_ipf_status);
> +
> +    /* The following 3 APIs find and print ipf lists by creating a string
> +     * representation of the state of an ipf list, to which 'dump' is
> +     * pointed.  'ipf_dump_start()' allocates memory for 'ipf_dump_ctx'.
> +     * 'ipf_dump_next()' finds the next ipf list and copies its
> +     * characteristics to a string, which is freed by the caller.
> +     * 'ipf_dump_done()' frees the 'ipf_dump_ctx' that was allocated in
> +     * 'ipf_dump_start'. */
> +    int (*ipf_dump_start)(struct dpif *, struct ipf_dump_ctx **ipf_dump_ctx);
> + int (*ipf_dump_next)(struct dpif *, void *ipf_dump_ctx, char **dump);
> + int (*ipf_dump_done)(struct dpif *, void *ipf_dump_ctx);
> +
> /* Meters */
>
> /* Queries 'dpif' for supported meter features.
> diff --git a/lib/ipf.c b/lib/ipf.c
> new file mode 100644
> index 0000000..1f537ae
> --- /dev/null
> +++ b/lib/ipf.c
> @@ -0,0 +1,1528 @@
> +/*
> + * Copyright (c) 2019 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <ctype.h>
> +#include <errno.h>
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/ip6.h>
> +#include <netinet/icmp6.h>
> +#include <string.h>
> +
> +#include "coverage.h"
> +#include "csum.h"
> +#include "ipf.h"
> +#include "latch.h"
> +#include "openvswitch/hmap.h"
> +#include "openvswitch/poll-loop.h"
> +#include "openvswitch/vlog.h"
> +#include "ovs-atomic.h"
> +#include "packets.h"
> +#include "util.h"
> +
> +VLOG_DEFINE_THIS_MODULE(ipf);
> +COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
> +
> +enum {
> + IPV4_PACKET_MAX_HDR_SIZE = 60,
> + IPV4_PACKET_MAX_SIZE = 65535,
> + IPV6_PACKET_MAX_DATA = 65535,
> +};
> +
> +enum ipf_list_state {
> + IPF_LIST_STATE_UNUSED,
> + IPF_LIST_STATE_REASS_FAIL,
> + IPF_LIST_STATE_OTHER_SEEN,
> + IPF_LIST_STATE_FIRST_SEEN,
> + IPF_LIST_STATE_LAST_SEEN,
> + IPF_LIST_STATE_FIRST_LAST_SEEN,
> + IPF_LIST_STATE_COMPLETED,
> + IPF_LIST_STATE_NUM,
> +};
> +
> +static char *ipf_state_name[IPF_LIST_STATE_NUM] =
> + {"unused", "reassemble fail", "other frag", "first frag", "last frag",
> + "first/last frag", "complete"};
> +
> +enum ipf_list_type {
> + IPF_FRAG_COMPLETED_LIST,
> + IPF_FRAG_EXPIRY_LIST,
> +};
> +
> +enum {
> + IPF_INVALID_IDX = -1,
> + IPF_V4_FRAG_SIZE_LBOUND = 400,
> + IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
> + IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
> + IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
> + IPF_MAX_FRAGS_DEFAULT = 1000,
> + IPF_NFRAG_UBOUND = 5000,
> +};
> +
> +enum ipf_counter_type {
> + IPF_NFRAGS_ACCEPTED,
> + IPF_NFRAGS_COMPL_SENT,
> + IPF_NFRAGS_EXPD_SENT,
> + IPF_NFRAGS_TOO_SMALL,
> + IPF_NFRAGS_OVERLAP,
> + IPF_NFRAGS_PURGED,
> + IPF_NFRAGS_NUM_CNTS,
> +};
> +
> +union ipf_addr {
> + ovs_be32 ipv4;
> + struct in6_addr ipv6;
> +};
> +
> +/* Represents a single fragment; part of a list of fragments. */
> +struct ipf_frag {
> + struct dp_packet *pkt;
> + uint16_t start_data_byte;
> + uint16_t end_data_byte;
> +    bool dnsteal;  /* 'do not steal': if true, ipf should not free packet. */
> +};
> +
> +/* The key for a collection of fragments potentially making up an
> + * unfragmented packet. */
> +struct ipf_list_key {
> +    /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the
> +     * first two members. */
> + union ipf_addr src_addr;
> + union ipf_addr dst_addr;
> + uint32_t recirc_id;
> + ovs_be32 ip_id; /* V6 is 32 bits. */
> + ovs_be16 dl_type;
> + uint16_t zone;
> + uint8_t nw_proto;
> +};
> +
> +/* A collection of fragments potentially making up an unfragmented packet. */
> +struct ipf_list {
> + struct hmap_node node; /* In struct ipf's 'frag_lists'. */
> + struct ovs_list list_node; /* In struct ipf's 'frag_exp_list' or
> + * 'frag_complete_list'. */
> + struct ipf_frag *frag_list; /* List of fragments for this list. */
> +    struct ipf_list_key key;    /* The key for the fragment list. */
> + struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
> + long long expiration; /* In milliseconds. */
> + int last_sent_idx; /* Last sent fragment idx. */
> + int last_inuse_idx; /* Last inuse fragment idx. */
> + int size; /* Fragment list size. */
> +    uint8_t state;              /* Frag list state; see ipf_list_state. */
> +};
> +
> +/* Represents a reassembled packet which typically is passed through
> + * conntrack. */
> +struct reassembled_pkt {
> + struct ovs_list rp_list_node; /* In struct ipf's
> + * 'reassembled_pkt_list'. */
> + struct dp_packet *pkt;
> + struct ipf_list *list;
> +};
> +
> +struct ipf {
> +    /* The clean thread is used to clean up fragments in the 'ipf'
> +     * module if packet batches are no longer sent by its user. */
> + pthread_t ipf_clean_thread;
> + struct latch ipf_clean_thread_exit;
> +
> + int max_v4_frag_list_size;
> +
> + struct ovs_mutex ipf_lock; /* Protects all of the following. */
> + /* These contain 'struct ipf_list's. */
> + struct hmap frag_lists OVS_GUARDED;
> + struct ovs_list frag_exp_list OVS_GUARDED;
> + struct ovs_list frag_complete_list OVS_GUARDED;
> + /* Contains 'struct reassembled_pkt's. */
> + struct ovs_list reassembled_pkt_list OVS_GUARDED;
> +
> + /* Used to allow disabling fragmentation reassembly. */
> + atomic_bool ifp_v4_enabled;
> + atomic_bool ifp_v6_enabled;
> +
> +    /* Will be clamped above 400 bytes; the value chosen should handle
> +     * ALG control packets of interest that use string encoding of mutable
> +     * IP fields; meaning, the control packets should not be fragmented. */
> + atomic_uint min_v4_frag_size;
> + atomic_uint min_v6_frag_size;
> +
> + /* Configurable maximum allowable fragments in process. */
> + atomic_uint nfrag_max;
> +
> + /* Number of fragments in process. */
> + atomic_count nfrag;
> +
> + atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];
> + atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];
> +};
> +
> +#define IPF_PTR(POINTER) \
> + CONST_CAST(struct ipf *, POINTER)
> +
> +static void
> +ipf_print_reass_packet(const char *es, const void *pkt)
> +{
> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
> + if (!VLOG_DROP_WARN(&rl)) {
> + struct ds ds = DS_EMPTY_INITIALIZER;
> + ds_put_hex_dump(&ds, pkt, 128, 0, false);
> + VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
> + ds_destroy(&ds);
> + }
> +}
> +
> +static void
> +ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
> +{
> +    atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
> +}
> +
> +static bool
> +ipf_get_v4_enabled(struct ipf *ipf)
> +{
> + bool ifp_v4_enabled_;
> + atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
> + return ifp_v4_enabled_;
> +}
> +
> +static bool
> +ipf_get_v6_enabled(struct ipf *ipf)
> +{
> + bool ifp_v6_enabled_;
> + atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
> + return ifp_v6_enabled_;
> +}
> +
> +static bool
> +ipf_get_enabled(struct ipf *ipf)
> +{
> + return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
> +}
> +
> +static uint32_t
> +ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
> +{
> + BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
> + return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
> +}
> +
> +/* Adds a list of fragments to the list tracking expiry of yet to be
> + * completed reassembled packets, hence subject to expiry. */
> +static void
> +ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
> + long long now)
> + /* OVS_REQUIRES(ipf->ipf_lock) */
> +{
> + enum {
> + IPF_FRAG_LIST_TIMEOUT = 15000,
> + };
> +
> + ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
> + ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
> +}
> +
> +/* Adds a list of fragments to the list of completed packets, which will be
> + * subsequently transmitted. */
> +static void
> +ipf_completed_list_add(struct ovs_list *frag_complete_list,
> + struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
> +}
> +
> +/* Adds a reassembled packet to the list of reassembled packets, awaiting
> + * some processing, such as being sent through conntrack. */
> +static void
> +ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
> + struct reassembled_pkt *rp)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
> +}
> +
> +/* Removes a frag list from tracking data structures and frees list heap
> + * memory. */
> +static void
> +ipf_list_clean(struct hmap *frag_lists,
> + struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ovs_list_remove(&ipf_list->list_node);
> + hmap_remove(frag_lists, &ipf_list->node);
> + free(ipf_list->frag_list);
> + free(ipf_list);
> +}
> +
> +/* Removes a frag list sitting on the expiry list from tracking
> + * data structures and frees list heap memory. */
> +static void
> +ipf_expiry_list_clean(struct hmap *frag_lists,
> + struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ipf_list_clean(frag_lists, ipf_list);
> +}
> +
> +/* Removes a frag list sitting on the completed list from tracking
> + * data structures and frees list heap memory. */
> +static void
> +ipf_completed_list_clean(struct hmap *frag_lists,
> + struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ipf_list_clean(frag_lists, ipf_list);
> +}
> +
> +static void
> +ipf_expiry_list_remove(struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ovs_list_remove(&ipf_list->list_node);
> +}
> +
> +static void
> +ipf_reassembled_list_remove(struct reassembled_pkt *rp)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + ovs_list_remove(&rp->rp_list_node);
> +}
> +
> +/* Symmetric */
> +static uint32_t
> +ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
> +{
> + uint32_t hsrc, hdst, hash;
> + hsrc = hdst = basis;
> + hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
> + hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
> + hash = hsrc ^ hdst;
> +
> + /* Hash the rest of the key. */
> + return hash_words((uint32_t *) (&key->dst_addr + 1),
> + (uint32_t *) (key + 1) -
> + (uint32_t *) (&key->dst_addr + 1),
> + hash);
> +}
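Since both addresses are hashed from the same 'basis' and the results XORed,
the address contribution is symmetric: swapping 'src_addr' and 'dst_addr' in
an otherwise identical key yields the same hash, with the remaining fields
(recirc_id, ip_id, dl_type, zone, nw_proto) hashed on top. For illustration
(hypothetical 'swapped' being 'key' with the two addresses exchanged):

    ovs_assert(ipf_list_key_hash(&key, basis)
               == ipf_list_key_hash(&swapped, basis));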
> +
> +static bool
> +ipf_is_first_v4_frag(const struct dp_packet *pkt)
> +{
> + const struct ip_header *l3 = dp_packet_l3(pkt);
> + if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
> + l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
> + return true;
> + }
> + return false;
> +}
> +
> +static bool
> +ipf_is_last_v4_frag(const struct dp_packet *pkt)
> +{
> + const struct ip_header *l3 = dp_packet_l3(pkt);
> + if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
> + !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
> + return true;
> + }
> + return false;
> +}
> +
> +static bool
> +ipf_is_v6_frag(ovs_be16 ip6f_offlg)
> +{
> + if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
> + return true;
> + }
> + return false;
> +}
> +
> +static bool
> +ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
> +{
> + if (!(ip6f_offlg & IP6F_OFF_MASK) &&
> + ip6f_offlg & IP6F_MORE_FRAG) {
> + return true;
> + }
> + return false;
> +}
> +
> +static bool
> +ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
> +{
> + if ((ip6f_offlg & IP6F_OFF_MASK) &&
> + !(ip6f_offlg & IP6F_MORE_FRAG)) {
> + return true;
> + }
> + return false;
> +}
> +
> +/* Checks whether a collection of fragments forms a complete packet. */
> +static bool
> +ipf_list_complete(const struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
> + if (ipf_list->frag_list[i - 1].end_data_byte + 1
> + != ipf_list->frag_list[i].start_data_byte) {
> + return false;
> + }
> + }
> + return true;
> +}
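This relies on ipf_sort() having ordered the list by start_data_byte, so
completeness reduces to adjacent byte ranges abutting. For example (made-up
offsets), fragments covering [0, 1399], [1400, 2799], [2800, 3000] pass,
while [0, 1399], [1500, 2799] leave a hole at [1400, 1499] and fail.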
> +
> +/* Insertion sort; runs in O(n) for a sorted or almost sorted list. */
> +static void
> +ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + for (int li = 1; li <= last_idx; li++) {
> + struct ipf_frag ipf_frag = frag_list[li];
> + int ci = li - 1;
> + while (ci >= 0 &&
> + frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
> + frag_list[ci + 1] = frag_list[ci];
> + ci--;
> + }
> + frag_list[ci + 1] = ipf_frag;
> + }
> +}
> +
> +/* Called on a sorted complete list of v4 fragments to reassemble them into
> + * a single packet that can be processed, such as passing through conntrack.
> + */
> +static struct dp_packet *
> +ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + struct ipf_frag *frag_list = ipf_list->frag_list;
> + struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
> + struct ip_header *l3 = dp_packet_l3(pkt);
> + int len = ntohs(l3->ip_tot_len);
> +
> +    int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
> +                   frag_list[1].start_data_byte + 1;
> +
> + if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
> + ipf_print_reass_packet(
> + "Unsupported big reassembled v4 packet; v4 hdr:", l3);
> + dp_packet_delete(pkt);
> + return NULL;
> + }
> +
> + dp_packet_prealloc_tailroom(pkt, len + rest_len);
> +
> + for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
> + size_t add_len = frag_list[i].end_data_byte -
> + frag_list[i].start_data_byte + 1;
> + len += add_len;
> + const char *l4 = dp_packet_l4(frag_list[i].pkt);
> + dp_packet_put(pkt, l4, add_len);
> + }
> + l3 = dp_packet_l3(pkt);
> +    ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
> + l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
> + new_ip_frag_off);
> + l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
> + l3->ip_tot_len = htons(len);
> + l3->ip_frag_off = new_ip_frag_off;
> + dp_packet_set_l2_pad_size(pkt, 0);
> +
> + return pkt;
> +}
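Two details worth noting here. First, 'rest_len' starts from frag_list[1]
because the reassembled packet begins as a clone of frag_list[0].pkt, so the
first fragment's data is already in place and only the later fragments' L4
payloads need tailroom. Second, the header fixup avoids a full checksum
recomputation by folding each changed 16-bit field (frag_off with MF
cleared, then ip_tot_len) into ip_csum via recalc_csum16(), the incremental
update helper from csum.h.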
> +
> +/* Called on a sorted complete list of v6 fragments to reassemble them into
> + * a single packet that can be processed, such as passing through conntrack.
> + */
> +static struct dp_packet *
> +ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + struct ipf_frag *frag_list = ipf_list->frag_list;
> + struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
> + struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
> + int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);
> +
> +    int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
> +                   frag_list[1].start_data_byte + 1;
> +
> + if (pl + rest_len > IPV4_PACKET_MAX_SIZE) {
> + ipf_print_reass_packet(
> + "Unsupported big reassembled v6 packet; v6 hdr:", l3);
> + dp_packet_delete(pkt);
> + return NULL;
> + }
> +
> + dp_packet_prealloc_tailroom(pkt, pl + rest_len);
> +
> + for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
> + size_t add_len = frag_list[i].end_data_byte -
> + frag_list[i].start_data_byte + 1;
> + pl += add_len;
> + const char *l4 = dp_packet_l4(frag_list[i].pkt);
> + dp_packet_put(pkt, l4, add_len);
> + }
> +
> + l3 = dp_packet_l3(pkt);
> +
> + uint8_t nw_proto = l3->ip6_nxt;
> + uint8_t nw_frag = 0;
> + const void *data = l3 + 1;
> + size_t datasize = pl;
> +
> + const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
> +    if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
> +        || !nw_frag || !frag_hdr) {
> +
> +        ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
> + dp_packet_delete(pkt);
> + return NULL;
> + }
> +
> + struct ovs_16aligned_ip6_frag *fh =
> + CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
> + fh->ip6f_offlg = 0;
> + l3->ip6_plen = htons(pl);
> + l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
> + dp_packet_set_l2_pad_size(pkt, 0);
> + return pkt;
> +}
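One small nit: the size guard in this function compares against
IPV4_PACKET_MAX_SIZE even though this is the v6 path. Both constants are
65535, so behavior is unchanged, but presumably this was meant to be:

    if (pl + rest_len > IPV6_PACKET_MAX_DATA) {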
> +
> +/* Called when a frag list state transitions to another state.  This is
> + * triggered by a new fragment for the list being received. */
> +static void
> +ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
> + bool ff, bool lf, bool v6)
> + OVS_REQUIRES(ipf->ipf_lock)
> +{
> + enum ipf_list_state curr_state = ipf_list->state;
> + enum ipf_list_state next_state;
> + switch (curr_state) {
> + case IPF_LIST_STATE_UNUSED:
> + case IPF_LIST_STATE_OTHER_SEEN:
> + if (ff) {
> + next_state = IPF_LIST_STATE_FIRST_SEEN;
> + } else if (lf) {
> + next_state = IPF_LIST_STATE_LAST_SEEN;
> + } else {
> + next_state = IPF_LIST_STATE_OTHER_SEEN;
> + }
> + break;
> + case IPF_LIST_STATE_FIRST_SEEN:
> + if (ff) {
> + next_state = IPF_LIST_STATE_FIRST_SEEN;
> + } else if (lf) {
> + next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
> + } else {
> + next_state = IPF_LIST_STATE_FIRST_SEEN;
> + }
> + break;
> + case IPF_LIST_STATE_LAST_SEEN:
> + if (ff) {
> + next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
> + } else if (lf) {
> + next_state = IPF_LIST_STATE_LAST_SEEN;
> + } else {
> + next_state = IPF_LIST_STATE_LAST_SEEN;
> + }
> + break;
> + case IPF_LIST_STATE_FIRST_LAST_SEEN:
> + next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
> + break;
> + case IPF_LIST_STATE_COMPLETED:
> + case IPF_LIST_STATE_REASS_FAIL:
> + case IPF_LIST_STATE_NUM:
> + default:
> + OVS_NOT_REACHED();
> + }
> +
> + if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
> + ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
> + if (ipf_list_complete(ipf_list)) {
> + struct dp_packet *reass_pkt = v6
> + ? ipf_reassemble_v6_frags(ipf_list)
> + : ipf_reassemble_v4_frags(ipf_list);
> + if (reass_pkt) {
> + struct reassembled_pkt *rp = xzalloc(sizeof *rp);
> + rp->pkt = reass_pkt;
> + rp->list = ipf_list;
> + ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
> + ipf_expiry_list_remove(ipf_list);
> + next_state = IPF_LIST_STATE_COMPLETED;
> + } else {
> + next_state = IPF_LIST_STATE_REASS_FAIL;
> + }
> + }
> + }
> + ipf_list->state = next_state;
> +}
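For review convenience, the switch above reduces to this transition table
('ff'/'lf' meaning the new fragment is a first/last fragment, 'other'
meaning neither):

    current state      ff               lf               other
    UNUSED/OTHER_SEEN  FIRST_SEEN       LAST_SEEN        OTHER_SEEN
    FIRST_SEEN         FIRST_SEEN       FIRST_LAST_SEEN  FIRST_SEEN
    LAST_SEEN          FIRST_LAST_SEEN  LAST_SEEN        LAST_SEEN
    FIRST_LAST_SEEN    FIRST_LAST_SEEN  FIRST_LAST_SEEN  FIRST_LAST_SEEN

On reaching FIRST_LAST_SEEN the list is sorted and, if contiguous,
reassembled: success moves it to COMPLETED and off the expiry list; failure
moves it to REASS_FAIL.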
> +
> +/* Some sanity checks are redundant, but prudent, in case code paths for
> + * fragments change in the future.  The processing cost for fragments is
> + * not important. */
> +static bool
> +ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
> +{
> + if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
> + goto invalid_pkt;
> + }
> +
> + const struct eth_header *l2 = dp_packet_eth(pkt);
> + const struct ip_header *l3 = dp_packet_l3(pkt);
> +
> + if (OVS_UNLIKELY(!l2 || !l3)) {
> + goto invalid_pkt;
> + }
> +
> + size_t l3_size = dp_packet_l3_size(pkt);
> + if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
> + goto invalid_pkt;
> + }
> +
> + if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
> + return false;
> + }
> +
> + uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
> + if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
> + goto invalid_pkt;
> + }
> +
> + size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
> + if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
> + goto invalid_pkt;
> + }
> + if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
> + goto invalid_pkt;
> + }
> +
> + if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
> + && csum(l3, ip_hdr_len) != 0)) {
> + goto invalid_pkt;
> + }
> +
> + uint32_t min_v4_frag_size_;
> + atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
> + bool lf = ipf_is_last_v4_frag(pkt);
> + if (OVS_UNLIKELY(!lf && dp_packet_size(pkt) < min_v4_frag_size_)) {
> + ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
> + goto invalid_pkt;
> + }
> + return true;
> +
> +invalid_pkt:
> + pkt->md.ct_state = CS_INVALID;
> + return false;
> +}
> +
> +static bool
> +ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
> + struct ipf_list_key *key, uint16_t *start_data_byte,
> + uint16_t *end_data_byte, bool *ff, bool *lf)
> +{
> + const struct ip_header *l3 = dp_packet_l3(pkt);
> + uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
> + size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
> +
> +    *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
> + *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
> + *ff = ipf_is_first_v4_frag(pkt);
> + *lf = ipf_is_last_v4_frag(pkt);
> + memset(key, 0, sizeof *key);
> + key->ip_id = be16_to_be32(l3->ip_id);
> + key->dl_type = dl_type;
> + key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
> + key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
> + key->nw_proto = l3->ip_proto;
> + key->zone = zone;
> + key->recirc_id = pkt->md.recirc_id;
> + return true;
> +}
> +
> +/* Some sanity checks are redundant, but prudent, in case code paths for
> + * fragments change in the future.  The processing cost for fragments is
> + * not important. */
> +static bool
> +ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
> +{
> + const struct eth_header *l2 = dp_packet_eth(pkt);
> + const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
> + const char *l4 = dp_packet_l4(pkt);
> +
> + if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
> + goto invalid_pkt;
> + }
> +
> + size_t l3_size = dp_packet_l3_size(pkt);
> + size_t l3_hdr_size = sizeof *l3;
> +
> + if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
> + goto invalid_pkt;
> + }
> +
> + uint8_t nw_frag = 0;
> + uint8_t nw_proto = l3->ip6_nxt;
> + const void *data = l3 + 1;
> + size_t datasize = l3_size - l3_hdr_size;
> + const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
> + if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
> + &frag_hdr) || !nw_frag || !frag_hdr) {
> + return false;
> + }
> +
> + int pl = ntohs(l3->ip6_plen);
> + if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
> + goto invalid_pkt;
> + }
> +
> + ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
> + if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
> + return false;
> + }
> +
> + uint32_t min_v6_frag_size_;
> + atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
> + bool lf = ipf_is_last_v6_frag(ip6f_offlg);
> +
> + if (OVS_UNLIKELY(!lf && dp_packet_size(pkt) < min_v6_frag_size_)) {
> + ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
> + goto invalid_pkt;
> + }
> +
> + return true;
> +
> +invalid_pkt:
> + pkt->md.ct_state = CS_INVALID;
> + return false;
> +}
> +
> +static void
> +ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
> + struct ipf_list_key *key, uint16_t *start_data_byte,
> + uint16_t *end_data_byte, bool *ff, bool *lf)
> +{
> + const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
> + const char *l4 = dp_packet_l4(pkt);
> + const char *tail = dp_packet_tail(pkt);
> + uint8_t pad = dp_packet_l2_pad_size(pkt);
> + size_t l3_size = tail - (char *)l3 - pad;
> + size_t l4_size = tail - (char *)l4 - pad;
> + size_t l3_hdr_size = sizeof *l3;
> + uint8_t nw_frag = 0;
> + uint8_t nw_proto = l3->ip6_nxt;
> + const void *data = l3 + 1;
> + size_t datasize = l3_size - l3_hdr_size;
> + const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
> +
> + parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
> + ovs_assert(nw_frag && frag_hdr);
> + ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
> + *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
> + sizeof (struct ovs_16aligned_ip6_frag);
> + *end_data_byte = *start_data_byte + l4_size - 1;
> + *ff = ipf_is_first_v6_frag(ip6f_offlg);
> + *lf = ipf_is_last_v6_frag(ip6f_offlg);
> + memset(key, 0, sizeof *key);
> + key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
> + key->dl_type = dl_type;
> + memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
> + /* We are not supporting parsing of the routing header to use as the
> + * dst address part of the key. */
> + memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
> + key->nw_proto = 0; /* Not used for key for V6. */
> + key->zone = zone;
> + key->recirc_id = pkt->md.recirc_id;
> +}
> +
> +static bool
> +ipf_list_key_eq(const struct ipf_list_key *key1,
> + const struct ipf_list_key *key2)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> +    if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
> +        !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
> + key1->dl_type == key2->dl_type &&
> + key1->ip_id == key2->ip_id &&
> + key1->zone == key2->zone &&
> + key1->nw_proto == key2->nw_proto &&
> + key1->recirc_id == key2->recirc_id) {
> + return true;
> + }
> + return false;
> +}
> +
> +static struct ipf_list *
> +ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
> + uint32_t hash)
> + /* OVS_REQUIRES(ipf->ipf_lock) */
> +{
> + struct ipf_list *ipf_list;
> + HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
> + if (ipf_list_key_eq(&ipf_list->key, key)) {
> + return ipf_list;
> + }
> + }
> + return NULL;
> +}
> +
> +static bool
> +ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
> + size_t start_data_byte, size_t end_data_byte)
> + /* OVS_REQUIRES(ipf_lock) */
> +{
> + for (int i = 0; i <= last_inuse_idx; i++) {
> + if ((start_data_byte >= frag_list[i].start_data_byte &&
> + start_data_byte <= frag_list[i].end_data_byte) ||
> + (end_data_byte >= frag_list[i].start_data_byte &&
> + end_data_byte <= frag_list[i].end_data_byte)) {
> + return true;
> + }
> + }
> + return false;
> +}
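One question on the overlap check: it flags a new fragment whose start or
end falls inside an existing fragment, but a new fragment that strictly
contains an existing one (starts before it and ends after it) slips through.
If that case should also count as an overlap, an extra clause along these
lines (hypothetical, untested) would cover it:

    || (start_data_byte <= frag_list[i].start_data_byte &&
        end_data_byte >= frag_list[i].end_data_byte)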
> +
> +/* Adds a fragment to a list of fragments, if the fragment is not a
> + * duplicate.  If the fragment is a duplicate, the fragment is marked
> + * invalid here, to avoid the work that conntrack would otherwise do to
> + * mark it invalid, which it would in all cases. */
> +static bool
> +ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
> + struct dp_packet *pkt, uint16_t start_data_byte,
> + uint16_t end_data_byte, bool ff, bool lf, bool v6,
> + bool dnsteal)
> + OVS_REQUIRES(ipf->ipf_lock)
> +{
> + bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
> + ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
> + int last_inuse_idx = ipf_list->last_inuse_idx;
> +
> + if (!duped_frag) {
> + if (last_inuse_idx < ipf_list->size - 1) {
> + /* In the case of dpdk, it would be unfortunate if we had
> + * to create a clone fragment outside the dpdk mp due to the
> + * mempool size being too limited. We will otherwise need to
> + * recommend not setting the mempool number of buffers too low
> + * and also clamp the number of fragments. */
> +            struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
> + frag->pkt = pkt;
> + frag->start_data_byte = start_data_byte;
> + frag->end_data_byte = end_data_byte;
> + frag->dnsteal = dnsteal;
> + ipf_list->last_inuse_idx++;
> + atomic_count_inc(&ipf->nfrag);
> + ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
> + ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
> + } else {
> + OVS_NOT_REACHED();
> + }
> + } else {
> + ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
> + pkt->md.ct_state = CS_INVALID;
> + return false;
> + }
> + return true;
> +}
> +
> +static void
> +ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
> + int max_frag_list_size)
> +{
> + ipf_list->key = *key;
> + ipf_list->last_inuse_idx = IPF_INVALID_IDX;
> + ipf_list->last_sent_idx = IPF_INVALID_IDX;
> + ipf_list->reass_execute_ctx = NULL;
> + ipf_list->state = IPF_LIST_STATE_UNUSED;
> + ipf_list->size = max_frag_list_size;
> + ipf_list->frag_list
> + = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
> +}
> +
> +/* Generates a fragment list key from a well-formed fragment and either
> + * starts a new fragment list or grows the existing one, while checking
> + * that the number of fragments does not exceed the supported maximum and
> + * that the list size is not impossibly big.  Calls 'ipf_process_frag()'
> + * to add a fragment to a list of fragments. */
> +static bool
> +ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
> + uint16_t zone, long long now, uint32_t hash_basis,
> + bool dnsteal)
> + OVS_REQUIRES(ipf->ipf_lock)
> +{
> + struct ipf_list_key key;
> + /* Initialize 4 variables for some versions of GCC. */
> + uint16_t start_data_byte = 0;
> + uint16_t end_data_byte = 0;
> + bool ff = false;
> + bool lf = false;
> + bool v6 = dl_type == htons(ETH_TYPE_IPV6);
> +
> + if (v6 && ipf_get_v6_enabled(ipf)) {
> + ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
> + &end_data_byte, &ff, &lf);
> + } else if (!v6 && ipf_get_v4_enabled(ipf)) {
> + ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
> + &end_data_byte, &ff, &lf);
> + } else {
> + return false;
> + }
> +
> + unsigned int nfrag_max;
> + atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
> + if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
> + return false;
> + }
> +
> + uint32_t hash = ipf_list_key_hash(&key, hash_basis);
> + struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
> + enum {
> + IPF_FRAG_LIST_MIN_INCREMENT = 4,
> + IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
> + };
> +
> + int max_frag_list_size;
> + if (v6) {
> +        /* Because the calculation with extension headers is variable,
> +         * we don't calculate a hard maximum fragment list size upfront.
> +         * The fragment list size is practically limited by the code,
> +         * however. */
> + max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
> + } else {
> + max_frag_list_size = ipf->max_v4_frag_list_size;
> + }
> +
> + if (!ipf_list) {
> + ipf_list = xmalloc(sizeof *ipf_list);
> +        ipf_list_init(ipf_list, &key,
> +                      MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
> + hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
> + ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
> + } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL) {
> + /* Bail out as early as possible. */
> + return false;
> + } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
> + int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
> + max_frag_list_size - ipf_list->size);
> + /* Enforce limit. */
> + if (increment > 0) {
> +            ipf_list->frag_list =
> +                xrealloc(ipf_list->frag_list,
> +                         (ipf_list->size + increment) *
> +                         sizeof *ipf_list->frag_list);
> + ipf_list->size += increment;
> + } else {
> + return false;
> + }
> + }
> +
> + return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
> + end_data_byte, ff, lf, v6, dnsteal);
> +}
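
To make the sizing policy concrete: a list starts at
MIN(max_frag_list_size, 4) entries and grows linearly by at most
IPF_FRAG_LIST_MIN_INCREMENT until it reaches the per-family maximum.  A
minimal sketch of that clamped growth in isolation (not the patch's API):

    /* Returns the new size, or the old size unchanged when the limit is
     * reached (the caller then rejects the fragment). */
    static int
    grow_clamped(int size, int increment, int max_size)
    {
        int step = increment < max_size - size ? increment
                                               : max_size - size;
        return step > 0 ? size + step : size;
    }

For example, with a v4 maximum of, say, 58, the sizes run 4, 8, ..., 56,
then 58.
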
> +
> +/* Filters fragments out of a batch of packets and adjusts the batch. */
> +static void
> +ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
> +                             ovs_be16 dl_type, uint16_t zone,
> +                             long long now, uint32_t hash_basis)
> +{
> + const size_t pb_cnt = dp_packet_batch_size(pb);
> + int pb_idx; /* Index in a packet batch. */
> + struct dp_packet *pkt;
> +
> + DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
> + if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
> + ipf_is_valid_v4_frag(ipf, pkt))
> + ||
> + (dl_type == htons(ETH_TYPE_IPV6) &&
> + ipf_is_valid_v6_frag(ipf, pkt)))) {
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> + if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
> + pb->do_not_steal)) {
> + dp_packet_batch_refill(pb, pkt, pb_idx);
> + }
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + } else {
> + dp_packet_batch_refill(pb, pkt, pb_idx);
> + }
> + }
> +}
> +
> +/* In case of DPDK, a memory source check is done, as DPDK memory pool
> + * management has trouble dealing with multiple source types.  The
> + * 'check_source' parameter indicates when this check is needed. */
> +static bool
> +ipf_dp_packet_batch_add(struct dp_packet_batch *pb, struct dp_packet *pkt,
> +                        bool check_source OVS_UNUSED)
> +{
> +#ifdef DPDK_NETDEV
> + if ((dp_packet_batch_is_full(pb)) ||
> + /* DPDK cannot handle multiple sources in a batch. */
> + (check_source && !dp_packet_batch_is_empty(pb)
> + && pb->packets[0]->source != pkt->source)) {
> +#else
> + if (dp_packet_batch_is_full(pb)) {
> +#endif
> + return false;
> + }
> +
> + dp_packet_batch_add(pb, pkt);
> + return true;
> +}
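
For clarity, the DPDK-only refusal condition above can be read as a single
predicate (a sketch only; the field names follow the patch):

    /* True when the batch cannot take 'pkt': it is full, or the source
     * check is on and 'pkt' comes from a different mempool source than
     * the batch's first packet. */
    static bool
    batch_refuses(const struct dp_packet_batch *pb,
                  const struct dp_packet *pkt, bool check_source)
    {
        return dp_packet_batch_is_full(pb)
               || (check_source && !dp_packet_batch_is_empty(pb)
                   && pb->packets[0]->source != pkt->source);
    }
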
> +
> +/* This is used in the rare cases where a list cannot be sent.  One known
> + * reason is the mempool source check, which exists for DPDK support:
> + * packets are no longer being received on any port with a source matching
> + * the fragments.  Another is a race where all conntrack rules are
> + * unconfigured while some fragments are yet to be flushed.
> + *
> + * Returns true if the list was purged. */
> +static bool
> +ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
> + long long now)
> + OVS_REQUIRES(ipf->ipf_lock)
> +{
> + enum {
> + IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
> + };
> +
> + if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
> + return false;
> + }
> +
> + while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
> +        struct dp_packet *pkt
> + = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
> + dp_packet_delete(pkt);
> + atomic_count_dec(&ipf->nfrag);
> + COVERAGE_INC(ipf_stuck_frag_list_purged);
> + ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
> + IPF_NFRAGS_PURGED);
> + ipf_list->last_sent_idx++;
> + }
> +
> + return true;
> +}
> +
> +/* Does the packet batch management and common accounting work associated
> + * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
> +static bool
> +ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
> +                       struct dp_packet_batch *pb,
> +                       enum ipf_list_type list_type, bool v6, long long now)
> + OVS_REQUIRES(ipf->ipf_lock)
> +{
> + if (ipf_purge_list_check(ipf, ipf_list, now)) {
> + return true;
> + }
> +
> + while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
> + struct dp_packet *pkt
> + = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
> + if (ipf_dp_packet_batch_add(pb, pkt, true)) {
> + ipf_list->last_sent_idx++;
> + atomic_count_dec(&ipf->nfrag);
> +
> + if (list_type == IPF_FRAG_COMPLETED_LIST) {
> + ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
> + } else {
> + ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
> + pkt->md.ct_state = CS_INVALID;
> + }
> +
> + if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
> + return true;
> + }
> + } else {
> + return false;
> + }
> + }
> + OVS_NOT_REACHED();
> +}
> +
> +/* Adds fragments associated with a completed fragment list to a packet
> + * batch to be processed by the calling application, typically conntrack.
> + * Also cleans up the list context when it is empty. */
> +static void
> +ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
> + long long now, bool v6)
> +{
> + if (ovs_list_is_empty(&ipf->frag_complete_list)) {
> + return;
> + }
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> + struct ipf_list *ipf_list, *next;
> +
> +    LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
> +                        &ipf->frag_complete_list) {
> +        if (ipf_send_frags_in_list(ipf, ipf_list, pb,
> +                                   IPF_FRAG_COMPLETED_LIST, v6, now)) {
> + ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
> + } else {
> + break;
> + }
> + }
> +
> + ovs_mutex_unlock(&ipf->ipf_lock);
> +}
> +
> +/* Conservatively adds fragments associated with an expired fragment list
> + * to a packet batch to be processed by the calling application, typically
> + * conntrack.  Also cleans up the list context when it is empty. */
> +static void
> +ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
> + long long now, bool v6)
> +{
> + enum {
> + /* Very conservative, due to DOS probability. */
> + IPF_FRAG_LIST_MAX_EXPIRED = 1,
> + };
> +
> + if (ovs_list_is_empty(&ipf->frag_exp_list)) {
> + return;
> + }
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> + struct ipf_list *ipf_list, *next;
> + size_t lists_removed = 0;
> +
> + LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
> + if (now <= ipf_list->expiration ||
> + lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
> + break;
> + }
> +
> +        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
> +                                   v6, now)) {
> + ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
> + lists_removed++;
> + } else {
> + break;
> + }
> + }
> +
> + ovs_mutex_unlock(&ipf->ipf_lock);
> +}
> +
> +/* Adds a reassembled packet to a packet batch to be processed by the
> + * caller. */
> +static void
> +ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
> +{
> + if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
> + return;
> + }
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> + struct reassembled_pkt *rp, *next;
> +
> +    LIST_FOR_EACH_SAFE (rp, next, rp_list_node,
> +                        &ipf->reassembled_pkt_list) {
> + if (!rp->list->reass_execute_ctx &&
> + ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
> + rp->list->reass_execute_ctx = rp->pkt;
> + }
> + }
> +
> + ovs_mutex_unlock(&ipf->ipf_lock);
> +}
> +
> +/* Checks for reassembled packets after processing by conntrack and edits
> + * the fragments if needed, based on what conntrack decided. */
> +static void
> +ipf_post_execute_reass_pkts(struct ipf *ipf,
> + struct dp_packet_batch *pb, bool v6)
> +{
> + if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
> + return;
> + }
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> + struct reassembled_pkt *rp, *next;
> +
> +    LIST_FOR_EACH_SAFE (rp, next, rp_list_node,
> +                        &ipf->reassembled_pkt_list) {
> + const size_t pb_cnt = dp_packet_batch_size(pb);
> + int pb_idx;
> + struct dp_packet *pkt;
> +        /* Inner batch loop is constant time since batch size is <=
> +         * NETDEV_MAX_BURST. */
> + DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
> + if (pkt == rp->list->reass_execute_ctx) {
> + for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
> +                    rp->list->frag_list[i].pkt->md.ct_label =
> +                        pkt->md.ct_label;
> +                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
> +                    rp->list->frag_list[i].pkt->md.ct_state =
> +                        pkt->md.ct_state;
> +                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
> +                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
> +                        pkt->md.ct_orig_tuple_ipv6;
> +                    if (pkt->md.ct_orig_tuple_ipv6) {
> +                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
> +                            pkt->md.ct_orig_tuple.ipv6;
> +                    } else {
> +                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
> +                            pkt->md.ct_orig_tuple.ipv4;
> +                    }
> + }
> +
> + const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
> + const char *tail_frag = dp_packet_tail(frag_0->pkt);
> + uint8_t pad_frag = dp_packet_l2_pad_size(frag_0->pkt);
> + void *l4_frag = dp_packet_l4(frag_0->pkt);
> + void *l4_reass = dp_packet_l4(pkt);
> + memcpy(l4_frag, l4_reass,
> + tail_frag - (char *) l4_frag - pad_frag);
> +
> + if (v6) {
> + struct ovs_16aligned_ip6_hdr *l3_frag
> + = dp_packet_l3(frag_0->pkt);
> +                    struct ovs_16aligned_ip6_hdr *l3_reass
> +                        = dp_packet_l3(pkt);
> + l3_frag->ip6_src = l3_reass->ip6_src;
> + l3_frag->ip6_dst = l3_reass->ip6_dst;
> + } else {
> + struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
> + struct ip_header *l3_reass = dp_packet_l3(pkt);
> +                    ovs_be32 reass_ip =
> +                        get_16aligned_be32(&l3_reass->ip_src);
> +                    ovs_be32 frag_ip =
> +                        get_16aligned_be32(&l3_frag->ip_src);
> + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> + frag_ip, reass_ip);
> + l3_frag->ip_src = l3_reass->ip_src;
> +
> + reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
> + frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
> + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> + frag_ip, reass_ip);
> + l3_frag->ip_dst = l3_reass->ip_dst;
> + }
> +
> +                    ipf_completed_list_add(&ipf->frag_complete_list,
> +                                           rp->list);
> + ipf_reassembled_list_remove(rp);
> + dp_packet_delete(rp->pkt);
> + free(rp);
> + } else {
> + dp_packet_batch_refill(pb, pkt, pb_idx);
> + }
> + }
> + }
> +
> + ovs_mutex_unlock(&ipf->ipf_lock);
> +}
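
The recalc_csum32() calls above are RFC 1624 incremental updates: they
rewrite a 32-bit field without summing the whole header again.  A minimal
standalone sketch of the arithmetic (byte-order handling elided; the real
helper lives in lib/csum.c):

    #include <stdint.h>

    /* HC' = ~(~HC + ~m + m'), with the 32-bit values folded into
     * 16-bit one's-complement words. */
    static uint16_t
    csum_update32(uint16_t csum, uint32_t old_u32, uint32_t new_u32)
    {
        uint32_t sum = (uint16_t) ~csum;
        sum += (~old_u32 & 0xffff) + (~old_u32 >> 16);  /* Add ~m.  */
        sum += (new_u32 & 0xffff) + (new_u32 >> 16);    /* Add m'.  */
        while (sum >> 16) {
            sum = (sum & 0xffff) + (sum >> 16);         /* Fold carries. */
        }
        return ~(uint16_t) sum;
    }
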
> +
> +/* Extracts any fragments from the batch and reassembles them when a
> + * complete packet is received.  When possible, completed packets are
> + * added to the batch to be sent through conntrack. */
> +void
> +ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> + long long now, ovs_be16 dl_type, uint16_t zone,
> + uint32_t hash_basis)
> +{
> + if (ipf_get_enabled(ipf)) {
> +        ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now,
> +                                     hash_basis);
> + }
> +
> + if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
> + ipf_execute_reass_pkts(ipf, pb);
> + }
> +}
> +
> +/* Updates fragments based on the processing of the reassembled packet
> + * sent through conntrack and adds these fragments to any batches seen.
> + * Expired fragments are marked as invalid and also added to the batches
> + * seen, with low priority.  Reassembled packets are freed. */
> +void
> +ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> + long long now, ovs_be16 dl_type)
> +{
> + if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
> + bool v6 = dl_type == htons(ETH_TYPE_IPV6);
> + ipf_post_execute_reass_pkts(ipf, pb, v6);
> + ipf_send_completed_frags(ipf, pb, now, v6);
> + ipf_send_expired_frags(ipf, pb, now, v6);
> + }
> +}
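
For readers new to the module, the intended call pattern brackets
conntrack like this (a sketch; the argument list of conntrack_execute()
is abbreviated and its real signature takes more parameters):

    ipf_preprocess_conntrack(ipf, batch, now, dl_type, zone, hash_basis);
    conntrack_execute(ct, batch, /* ... */);
    ipf_postprocess_conntrack(ipf, batch, now, dl_type);

Preprocess pulls fragments out of the batch and substitutes reassembled
packets; postprocess copies conntrack's verdict back onto the fragments
and returns them to the batch.
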
> +
> +static void *
> +ipf_clean_thread_main(void *f)
> +{
> + struct ipf *ipf = f;
> +
> + enum {
> + IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
> + };
> +
> + while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {
> +
> + long long now = time_msec();
> +
> + if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
> + !ovs_list_is_empty(&ipf->frag_complete_list)) {
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> +
> + struct ipf_list *ipf_list, *next;
> + LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
> + &ipf->frag_exp_list) {
> + if (ipf_purge_list_check(ipf, ipf_list, now)) {
> + ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
> + }
> + }
> +
> + LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
> + &ipf->frag_complete_list) {
> + if (ipf_purge_list_check(ipf, ipf_list, now)) {
> + ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
> + }
> + }
> +
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + }
> +
> + poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
> + latch_wait(&ipf->ipf_clean_thread_exit);
> + poll_block();
> + }
> +
> + return NULL;
> +}
> +
> +struct ipf *
> +ipf_init(void)
> +{
> + struct ipf *ipf = xzalloc(sizeof *ipf);
> +
> + ovs_mutex_init_adaptive(&ipf->ipf_lock);
> + ovs_mutex_lock(&ipf->ipf_lock);
> + hmap_init(&ipf->frag_lists);
> + ovs_list_init(&ipf->frag_exp_list);
> + ovs_list_init(&ipf->frag_complete_list);
> + ovs_list_init(&ipf->reassembled_pkt_list);
> + atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
> + atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
> + ipf->max_v4_frag_list_size = DIV_ROUND_UP(
> + IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
> + ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + atomic_count_init(&ipf->nfrag, 0);
> + for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
> + atomic_init(&ipf->n4frag_cnt[i], 0);
> + atomic_init(&ipf->n6frag_cnt[i], 0);
> + }
> + atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
> + atomic_init(&ipf->ifp_v4_enabled, true);
> + atomic_init(&ipf->ifp_v6_enabled, true);
> + latch_init(&ipf->ipf_clean_thread_exit);
> +    ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
> +                                              ipf_clean_thread_main, ipf);
> +
> + return ipf;
> +}
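
As a sanity check on the max_v4_frag_list_size arithmetic: assuming
IPV4_PACKET_MAX_SIZE is 65535, IPV4_PACKET_MAX_HDR_SIZE is 60, and
IPF_V4_FRAG_SIZE_MIN_DEF is 1200 (values quoted from memory, worth
double-checking against the enums earlier in the file):

    /* DIV_ROUND_UP(65535 - 60, 1200 - 60)
     *   = DIV_ROUND_UP(65475, 1140) = 58 fragments at most per v4 list. */
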
> +
> +void
> +ipf_destroy(struct ipf *ipf)
> +{
> + ovs_mutex_lock(&ipf->ipf_lock);
> + latch_set(&ipf->ipf_clean_thread_exit);
> + pthread_join(ipf->ipf_clean_thread, NULL);
> + latch_destroy(&ipf->ipf_clean_thread_exit);
> +
> + struct ipf_list *ipf_list;
> + HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
> + while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
> + struct dp_packet *pkt
> + = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
> +            if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
> + dp_packet_delete(pkt);
> + }
> + atomic_count_dec(&ipf->nfrag);
> + ipf_list->last_sent_idx++;
> + }
> + free(ipf_list->frag_list);
> + free(ipf_list);
> + }
> +
> + if (atomic_count_get(&ipf->nfrag)) {
> +        VLOG_WARN("ipf destroy with non-zero fragment count.");
> + }
> +
> + struct reassembled_pkt *rp;
> + LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
> + dp_packet_delete(rp->pkt);
> + free(rp);
> + }
> +
> + hmap_destroy(&ipf->frag_lists);
> + ovs_list_poison(&ipf->frag_exp_list);
> + ovs_list_poison(&ipf->frag_complete_list);
> + ovs_list_poison(&ipf->reassembled_pkt_list);
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + ovs_mutex_destroy(&ipf->ipf_lock);
> + free(ipf);
> +}
> +
> +int
> +ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
> +{
> + atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
> + enable);
> + return 0;
> +}
> +
> +int
> +ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
> +{
> + /* If the user specifies an unreasonably large number, fragmentation
> + * will not work well but it will not blow up. */
> +    if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) {
> + return 1;
> + }
> +
> + ovs_mutex_lock(&ipf->ipf_lock);
> + if (v6) {
> + atomic_store_relaxed(&ipf->min_v6_frag_size, value);
> + } else {
> + atomic_store_relaxed(&ipf->min_v4_frag_size, value);
> + ipf->max_v4_frag_list_size = DIV_ROUND_UP(
> + IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
> + ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
> + }
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + return 0;
> +}
> +
> +int
> +ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
> +{
> + if (value > IPF_NFRAG_UBOUND) {
> + return 1;
> + }
> + atomic_store_relaxed(&ipf->nfrag_max, value);
> + return 0;
> +}
> +
> +int
> +ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
> +{
> + ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
> + atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);
> +
> + atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
> + atomic_read_relaxed(&ipf->min_v4_frag_size,
> + &ipf_status->v4.min_frag_size);
> + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
> + &ipf_status->v4.nfrag_accepted);
> + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
> + &ipf_status->v4.nfrag_completed_sent);
> + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
> + &ipf_status->v4.nfrag_expired_sent);
> + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
> + &ipf_status->v4.nfrag_too_small);
> + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
> + &ipf_status->v4.nfrag_overlap);
> + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
> + &ipf_status->v4.nfrag_purged);
> +
> + atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
> + atomic_read_relaxed(&ipf->min_v6_frag_size,
> + &ipf_status->v6.min_frag_size);
> + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
> + &ipf_status->v6.nfrag_accepted);
> + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
> + &ipf_status->v6.nfrag_completed_sent);
> + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
> + &ipf_status->v6.nfrag_expired_sent);
> + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
> + &ipf_status->v6.nfrag_too_small);
> + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
> + &ipf_status->v6.nfrag_overlap);
> + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
> + &ipf_status->v6.nfrag_purged);
> + return 0;
> +}
> +
> +struct ipf_dump_ctx {
> + struct hmap_position bucket_pos;
> +};
> +
> +/* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
> + * caller must call ipf_dump_done() when dumping is finished. */
> +int
> +ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
> +{
> + *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
> + return 0;
> +}
> +
> +/* Creates a string representation of the state of an 'ipf_list' and puts
> + * it in 'ds'. */
> +static void
> +ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
> +{
> + ds_put_cstr(ds, "(");
> + if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
> + ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
> + IP_ARGS(ipf_list->key.src_addr.ipv4),
> + IP_ARGS(ipf_list->key.dst_addr.ipv4));
> + } else {
> + ds_put_cstr(ds, "src=");
> + ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
> + ds_put_cstr(ds, ",dst=");
> + ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
> + ds_put_cstr(ds, ",");
> + }
> +
> + ds_put_format(ds,
> "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
> + ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
> + ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
> + ipf_list->key.nw_proto);
> +
> + ds_put_format(ds, ",num_fragments=%u,state=%s",
> + ipf_list->last_inuse_idx + 1,
> + ipf_state_name[ipf_list->state]);
> +
> + ds_put_cstr(ds, ")");
> +}
> +
> +/* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and
> + * uses ipf_dump_create() to create a string representation of its state,
> + * to which 'dump' is pointed.  Returns EOF when there are no more ipf
> + * lists. */
> +int
> +ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx,
> +              char **dump)
> +{
> + ovs_mutex_lock(&ipf->ipf_lock);
> +
> + struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
> + &ipf_dump_ctx->bucket_pos);
> + if (!node) {
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + return EOF;
> + } else {
> + struct ipf_list *ipf_list_;
> + INIT_CONTAINER(ipf_list_, node, node);
> + struct ipf_list ipf_list = *ipf_list_;
> + ovs_mutex_unlock(&ipf->ipf_lock);
> + struct ds ds = DS_EMPTY_INITIALIZER;
> + ipf_dump_create(&ipf_list, &ds);
> + *dump = ds_steal_cstr(&ds);
> + return 0;
> + }
> +}
> +
> +/* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
> +int
> +ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
> +{
> + free(ipf_dump_ctx);
> + return 0;
> +}
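
The dump API is a classic start/next/done iterator.  A minimal caller
sketch, e.g. from a dpctl handler (hypothetical; error handling elided,
and the string from ipf_dump_next() comes from ds_steal_cstr() so the
caller frees it):

    struct ipf_dump_ctx *ctx;
    char *line;

    ipf_dump_start(&ctx);
    while (ipf_dump_next(ipf, ctx, &line) != EOF) {
        puts(line);        /* dpctl would append to a ds instead. */
        free(line);
    }
    ipf_dump_done(ctx);
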
> diff --git a/lib/ipf.h b/lib/ipf.h
> new file mode 100644
> index 0000000..6ac91b2
> --- /dev/null
> +++ b/lib/ipf.h
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2019 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef IPF_H
> +#define IPF_H 1
> +
> +#include "dp-packet.h"
> +#include "openvswitch/types.h"
> +
> +struct ipf;
> +
> +struct ipf_proto_status {
> + uint64_t nfrag_accepted;
> + uint64_t nfrag_completed_sent;
> + uint64_t nfrag_expired_sent;
> + uint64_t nfrag_too_small;
> + uint64_t nfrag_overlap;
> + uint64_t nfrag_purged;
> + unsigned int min_frag_size;
> + bool enabled;
> +};
> +
> +struct ipf_status {
> + struct ipf_proto_status v4;
> + struct ipf_proto_status v6;
> + unsigned int nfrag;
> + unsigned int nfrag_max;
> +};
> +
> +struct ipf *ipf_init(void);
> +void ipf_destroy(struct ipf *ipf);
> +void ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> +                              long long now, ovs_be16 dl_type,
> +                              uint16_t zone, uint32_t hash_basis);
> +
> +void ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> +                               long long now, ovs_be16 dl_type);
> +
> +int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable);
> +int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value);
> +int ipf_set_max_nfrags(struct ipf *ipf, uint32_t value);
> +int ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status);
> +
> +struct ipf_dump_ctx;
> +int ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx);
> +int ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx,
> + char **dump);
> +int ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx);
> +
> +#endif /* ipf.h */
> diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at
> index 3296d64..1057e34 100644
> --- a/tests/system-kmod-macros.at
> +++ b/tests/system-kmod-macros.at
> @@ -77,12 +77,6 @@ m4_define([CHECK_CONNTRACK],
> #
> m4_define([CHECK_CONNTRACK_ALG])
>
> -# CHECK_CONNTRACK_FRAG()
> -#
> -# Perform requirements checks for running conntrack fragmentations tests.
> -# The kernel always supports fragmentation, so no check is needed.
> -m4_define([CHECK_CONNTRACK_FRAG])
> -
> # CHECK_CONNTRACK_LOCAL_STACK()
> #
> # Perform requirements checks for running conntrack tests with local stack.
> @@ -140,6 +134,46 @@ m4_define([CHECK_CT_DPIF_GET_NCONNS],
> AT_SKIP_IF([:])
> ])
>
> +# DPCTL_SET_MIN_FRAG_SIZE()
> +#
> +# The kernel does not support this command.
> +m4_define([DPCTL_SET_MIN_FRAG_SIZE],
> +[
> +
> +])
> +
> +# DPCTL_MODIFY_FRAGMENTATION()
> +#
> +# The kernel does not support this command.
> +m4_define([DPCTL_MODIFY_FRAGMENTATION],
> +[
> +
> +])
> +
> +# DPCTL_CHECK_FRAGMENTATION_PASS()
> +#
> +# The kernel does not support this command.
> +m4_define([DPCTL_CHECK_FRAGMENTATION_PASS],
> +[
> +
> +])
> +
> +# DPCTL_CHECK_V6_FRAGMENTATION_PASS()
> +#
> +# The kernel does not support this command.
> +m4_define([DPCTL_CHECK_V6_FRAGMENTATION_PASS],
> +[
> +
> +])
> +
> +# DPCTL_CHECK_FRAGMENTATION_FAIL()
> +#
> +# The kernel does not support this command.
> +m4_define([DPCTL_CHECK_FRAGMENTATION_FAIL],
> +[
> +
> +])
> +
> # OVS_CHECK_KERNEL([minversion], [minsublevel], [maxversion], [maxsublevel])
> #
> # Check if kernel version falls between minversion.minsublevel and
> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> index de40734..6da5ac8 100644
> --- a/tests/system-traffic.at
> +++ b/tests/system-traffic.at
> @@ -2356,7 +2356,6 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
> @@ -2375,6 +2374,9 @@ priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
>
> AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
>
> +dnl Modify userspace conntrack fragmentation handling.
> +DPCTL_MODIFY_FRAGMENTATION()
> +
> dnl Ipv4 fragmentation connectivity check.
> NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
> 3 packets transmitted, 3 received, 0% packet loss, time 0ms
> @@ -2385,12 +2387,14 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING
> 3 packets transmitted, 3 received, 0% packet loss, time 0ms
> ])
>
> +dnl Check userspace conntrack fragmentation counters.
> +DPCTL_CHECK_FRAGMENTATION_PASS()
> +
> OVS_TRAFFIC_VSWITCHD_STOP
> AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation expiry])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
> @@ -2411,17 +2415,22 @@ priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
>
> AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
>
> +dnl Modify userspace conntrack fragmentation handling.
> +DPCTL_MODIFY_FRAGMENTATION()
> +
> dnl Ipv4 fragmentation connectivity check.
> NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 1 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
> 7 packets transmitted, 0 received, 100% packet loss, time 0ms
> ])
>
> +dnl Check userspace conntrack fragmentation counters.
> +DPCTL_CHECK_FRAGMENTATION_FAIL()
> +
> OVS_TRAFFIC_VSWITCHD_STOP
> AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation + vlan])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
> @@ -2442,6 +2451,9 @@ priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
>
> AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
>
> +dnl Modify userspace conntrack fragmentation handling.
> +DPCTL_MODIFY_FRAGMENTATION()
> +
> dnl Ipv4 fragmentation connectivity check.
> NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl
> 3 packets transmitted, 3 received, 0% packet loss, time 0ms
> @@ -2452,12 +2464,14 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING
> 3 packets transmitted, 3 received, 0% packet loss, time 0ms
> ])
>
> +dnl Check userspace conntrack fragmentation counters.
> +DPCTL_CHECK_FRAGMENTATION_PASS()
> +
> OVS_TRAFFIC_VSWITCHD_STOP
> AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation + cvlan])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0])
> OVS_CHECK_8021AD()
>
> @@ -2511,6 +2525,8 @@ AT_CLEANUP
> AT_SETUP([conntrack - IPv4 fragmentation incomplete reassembled packet])
> CHECK_CONNTRACK()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
> +
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2532,8 +2548,8 @@ AT_CLEANUP
> dnl Uses same first fragment as above 'incomplete reassembled packet' test.
> AT_SETUP([conntrack - IPv4 fragmentation with fragments specified])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2556,8 +2572,8 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation out of order])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2580,9 +2596,9 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation overlapping fragments by 1 octet])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_FRAG_OVERLAP()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2604,9 +2620,9 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv4 fragmentation overlapping fragments by 1 octet out of order])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_FRAG_OVERLAP()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2628,7 +2644,6 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
> @@ -2668,7 +2683,6 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation expiry])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
> @@ -2709,7 +2723,6 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation + vlan])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
> @@ -2752,7 +2765,6 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation + cvlan])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0])
> OVS_CHECK_8021AD()
>
> @@ -2807,6 +2819,7 @@ AT_CLEANUP
> AT_SETUP([conntrack - IPv6 fragmentation incomplete reassembled packet])
> CHECK_CONNTRACK()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2827,8 +2840,8 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation with fragments specified])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2851,8 +2864,8 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation out of order])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2875,9 +2888,9 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2901,9 +2914,9 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers + out of order])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2927,9 +2940,9 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers 2])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2953,9 +2966,9 @@ AT_CLEANUP
>
> AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers 2 + out of order])
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN()
> OVS_TRAFFIC_VSWITCHD_START()
> +DPCTL_SET_MIN_FRAG_SIZE()
>
> ADD_NAMESPACES(at_ns0, at_ns1)
>
> @@ -2980,7 +2993,6 @@ AT_CLEANUP
> AT_SETUP([conntrack - Fragmentation over vxlan])
> OVS_CHECK_VXLAN()
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_LOCAL_STACK()
>
> OVS_TRAFFIC_VSWITCHD_START()
> @@ -3033,7 +3045,6 @@ AT_CLEANUP
> AT_SETUP([conntrack - IPv6 Fragmentation over vxlan])
> OVS_CHECK_VXLAN()
> CHECK_CONNTRACK()
> -CHECK_CONNTRACK_FRAG()
> CHECK_CONNTRACK_LOCAL_STACK()
>
> OVS_TRAFFIC_VSWITCHD_START()
> diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at
> index 27bde8b..4ea55ea 100644
> --- a/tests/system-userspace-macros.at
> +++ b/tests/system-userspace-macros.at
> @@ -73,15 +73,6 @@ m4_define([CHECK_CONNTRACK],
> #
> m4_define([CHECK_CONNTRACK_ALG])
>
> -# CHECK_CONNTRACK_FRAG()
> -#
> -# Perform requirements checks for running conntrack fragmentations tests.
> -# The userspace doesn't support fragmentation yet, so skip the tests.
> -m4_define([CHECK_CONNTRACK_FRAG],
> -[
> - AT_SKIP_IF([:])
> -])
> -
> # CHECK_CONNTRACK_LOCAL_STACK()
> #
> # Perform requirements checks for running conntrack tests with local stack.
> @@ -95,19 +86,13 @@ m4_define([CHECK_CONNTRACK_LOCAL_STACK],
>
> # CHECK_CONNTRACK_FRAG_OVERLAP()
> #
> -# The userspace datapath does not support fragments yet.
> -m4_define([CHECK_CONNTRACK_FRAG_OVERLAP],
> -[
> - AT_SKIP_IF([:])
> -])
> +# The userspace datapath supports the fragment overlap check.
> +m4_define([CHECK_CONNTRACK_FRAG_OVERLAP])
>
> -# CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN()
> +# CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN
> #
> -# The userspace datapath does not support fragments yet.
> -m4_define([CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN],
> -[
> - AT_SKIP_IF([:])
> -])
> +# The userspace datapath supports fragments with multiple extension headers.
> +m4_define([CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN])
>
> # CHECK_CONNTRACK_NAT()
> #
> @@ -137,6 +122,167 @@ m4_define([CHECK_CT_DPIF_SET_GET_MAXCONNS])
> # userspace datapath does support this feature.
> m4_define([CHECK_CT_DPIF_GET_NCONNS])
>
> +# DPCTL_SET_MIN_FRAG_SIZE()
> +#
> +# The userspace datapath supports this command.
> +m4_define([DPCTL_SET_MIN_FRAG_SIZE],
> +[
> +AT_CHECK([ovs-appctl dpctl/ipf-set-min-frag v4 400], [], [dnl
> +setting minimum fragment size successful
> +])
> +AT_CHECK([ovs-appctl dpctl/ipf-set-min-frag v6 400], [], [dnl
> +setting minimum fragment size successful
> +])
> +])
> +
> +# DPCTL_MODIFY_FRAGMENTATION()
> +#
> +# The userspace datapath supports this command.
> +m4_define([DPCTL_MODIFY_FRAGMENTATION],
> +[
> +AT_CHECK([ovs-appctl dpctl/ipf-set-min-frag v4 1000], [], [dnl
> +setting minimum fragment size successful
> +])
> +AT_CHECK([ovs-appctl dpctl/ipf-set-max-nfrags 500], [], [dnl
> +setting maximum fragments successful
> +])
> +AT_CHECK([ovs-appctl dpctl/ipf-get-status], [], [dnl
> + Fragmentation Module Status
> + ---------------------------
> + v4 enabled: 1
> + v6 enabled: 1
> + max num frags (v4/v6): 500
> + num frag: 0
> + min v4 frag size: 1000
> + v4 frags accepted: 0
> + v4 frags completed: 0
> + v4 frags expired: 0
> + v4 frags too small: 0
> + v4 frags overlapped: 0
> + v4 frags purged: 0
> + min v6 frag size: 1280
> + v6 frags accepted: 0
> + v6 frags completed: 0
> + v6 frags expired: 0
> + v6 frags too small: 0
> + v6 frags overlapped: 0
> + v6 frags purged: 0
> +])
> +])
> +
> +# DPCTL_CHECK_FRAGMENTATION_PASS()
> +#
> +# Used to check fragmentation counters for some fragmentation tests using
> +# the userspace datapath.
> +m4_define([DPCTL_CHECK_FRAGMENTATION_PASS],
> +[
> +AT_CHECK([ovs-appctl dpctl/ipf-get-status --more], [], [dnl
> + Fragmentation Module Status
> + ---------------------------
> + v4 enabled: 1
> + v6 enabled: 1
> + max num frags (v4/v6): 500
> + num frag: 0
> + min v4 frag size: 1000
> + v4 frags accepted: 30
> + v4 frags completed: 30
> + v4 frags expired: 0
> + v4 frags too small: 0
> + v4 frags overlapped: 0
> + v4 frags purged: 0
> + min v6 frag size: 1280
> + v6 frags accepted: 0
> + v6 frags completed: 0
> + v6 frags expired: 0
> + v6 frags too small: 0
> + v6 frags overlapped: 0
> + v6 frags purged: 0
> +
> + Fragment Lists:
> +
> +])
> +])
> +
> +# DPCTL_CHECK_V6_FRAGMENTATION_PASS()
> +#
> +# Used to check fragmentation counters for some fragmentation tests using
> +# the userspace datapath.
> +m4_define([DPCTL_CHECK_V6_FRAGMENTATION_PASS],
> +[
> +AT_CHECK([ovs-appctl dpctl/ipf-get-status --more], [], [dnl
> + Fragmentation Module Status
> + ---------------------------
> + v4 enabled: 1
> + v6 enabled: 1
> + max num frags (v4/v6): 1000
> + num frag: 0
> + min v4 frag size: 1200
> + v4 frags accepted: 0
> + v4 frags completed: 0
> + v4 frags expired: 0
> + v4 frags too small: 0
> + v4 frags overlapped: 0
> + v4 frags purged: 0
> + min v6 frag size: 1280
> + v6 frags accepted: 30
> + v6 frags completed: 30
> + v6 frags expired: 0
> + v6 frags too small: 0
> + v6 frags overlapped: 0
> + v6 frags purged: 0
> +
> + Fragment Lists:
> +
> +])
> +])
> +
> +# FORMAT_FRAG_LIST()
> +#
> +# Strip content from the piped input which can differ from test to test;
> +# the recirc_id and ip_id fields in an ipf_list vary from test to test and
> +# hence are cleared.
> +m4_define([FORMAT_FRAG_LIST],
> +  [[sed -e 's/ip_id=[0-9]*/ip_id=<cleared>/g' -e 's/recirc_id=[0-9]*/recirc_id=<cleared>/g']])
> +
> +# DPCTL_CHECK_FRAGMENTATION_FAIL()
> +#
> +# Used to check fragmentation counters for some fragmentation tests using
> +# the userspace datapath, when failure to transmit fragments is expected.
> +m4_define([DPCTL_CHECK_FRAGMENTATION_FAIL],
> +[
> +AT_CHECK([ovs-appctl dpctl/ipf-get-status -m | FORMAT_FRAG_LIST()], [], [dnl
> + Fragmentation Module Status
> + ---------------------------
> + v4 enabled: 1
> + v6 enabled: 1
> + max num frags (v4/v6): 500
> + num frag: 7
> + min v4 frag size: 1000
> + v4 frags accepted: 7
> + v4 frags completed: 0
> + v4 frags expired: 0
> + v4 frags too small: 0
> + v4 frags overlapped: 0
> + v4 frags purged: 0
> + min v6 frag size: 1280
> + v6 frags accepted: 0
> + v6 frags completed: 0
> + v6 frags expired: 0
> + v6 frags too small: 0
> + v6 frags overlapped: 0
> + v6 frags purged: 0
> +
> + Fragment Lists:
> +
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +(src=10.1.1.1,dst=10.1.1.2,recirc_id=<cleared>,ip_id=<cleared>,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
> +])
> +])
> +
> # OVS_CHECK_KERNEL([minversion], [maxversion], [minsublevel], [maxsublevel])
> #
> # The userspace skips all tests that check kernel version.
> --
> 1.9.1
>