Re: [ovs-dev] [patch v3 2/9] Userspace datapath: Add v4 fragmentation handling.

Darrell Ball Thu, 25 Jan 2018 21:35:56 -0800


On 1/25/18, 6:07 PM, "[email protected] on behalf of Darrell 
Ball" <[email protected] on behalf of [email protected]> wrote:


    Fragmentation handling is added for supporting conntrack.
    Presently, only v4 is supported, with v6 coming soon.
    Fragmentation handling is disabled by default and enabled
    via a user command implemented in a subsequent patch.
    
    Signed-off-by: Darrell Ball <[email protected]>
    ---
     lib/automake.mk |   2 +
     lib/ipf.c       | 872 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     lib/ipf.h       |  53 ++++
     3 files changed, 927 insertions(+)
     create mode 100644 lib/ipf.c
     create mode 100644 lib/ipf.h
    
    diff --git a/lib/automake.mk b/lib/automake.mk
    index 159319f..6ca6a1e 100644
    --- a/lib/automake.mk
    +++ b/lib/automake.mk
    @@ -107,6 +107,8 @@ lib_libopenvswitch_la_SOURCES = \
        lib/hmapx.h \
        lib/id-pool.c \
        lib/id-pool.h \
    +   lib/ipf.c \
    +   lib/ipf.h \
        lib/jhash.c \
        lib/jhash.h \
        lib/json.c \
    diff --git a/lib/ipf.c b/lib/ipf.c
    new file mode 100644
    index 0000000..969334a
    --- /dev/null
    +++ b/lib/ipf.c
    @@ -0,0 +1,872 @@
    +/*
    + * Copyright (c) 2018 Nicira, Inc.
    + *
    + * Licensed under the Apache License, Version 2.0 (the "License");
    + * you may not use this file except in compliance with the License.
    + * You may obtain a copy of the License at:
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include <config.h>
    +#include <ctype.h>
    +#include <errno.h>
    +#include <sys/types.h>
    +#include <netinet/in.h>
    +#include <netinet/icmp6.h>
    +#include <string.h>
    +
    +#include "csum.h"
    +#include "ipf.h"
    +#include "openvswitch/hmap.h"
    +#include "openvswitch/vlog.h"
    +#include "ovs-atomic.h"
    +#include "util.h"
    +
    +VLOG_DEFINE_THIS_MODULE(ipf);
    +
    +enum {
    +    IPV4_PACKET_MAX_SIZE = 65535  /* Upper bound on the reassembled length,
    +                                   * enforced in ipf_reassemble_frags(). */
    +};
    +
    +enum ipf_list_state {  /* Reassembly progress of one fragment list; advanced
    +                        * by ipf_list_state_transition(). */
    +    IPF_LIST_STATE_UNUSED,           /* Value 0: a freshly zeroed list. */
    +    IPF_LIST_STATE_OTHER_SEEN,       /* Only middle fragments so far. */
    +    IPF_LIST_STATE_FIRST_SEEN,       /* First fragment seen, last not. */
    +    IPF_LIST_STATE_LAST_SEEN,        /* Last fragment seen, first not. */
    +    IPF_LIST_STATE_FIRST_LAST_SEEN,  /* Both first and last seen. */
    +    IPF_LIST_STATE_COMPLETED,        /* Fragments found contiguous. */
    +};
    +
    +enum ipf_list_type {  /* Names the global list a fragment list is being
    +                       * drained from or unlinked from. */
    +    IPF_FRAG_COMPLETED_LIST,  /* 'frag_complete_list'. */
    +    IPF_FRAG_EXPIRY_LIST,     /* 'frag_exp_list'. */
    +};
    +
    +enum {
    +    IPF_INVALID_IDX = -1,  /* "No entry yet" for the list index fields. */
    +    FRAG_SIZE_LOWER_BOUND = 400,  /* Not used in this file yet; presumably
    +                                   * the clamp for 'min_frag_size'. */
    +    FRAG_SIZE_MIN_DEF = 1200,  /* Initial 'min_frag_size'; see ipf_init(). */
    +    MAX_FRAGS_DEFAULT = 1000,  /* Initial 'nfrag_max'; see ipf_init(). */
    +    NFRAG_UPPER_BOUND = 5000,  /* Not used in this file yet; presumably the
    +                                * clamp for 'nfrag_max'. */
    +};
    +
    +struct ipf_addr {  /* Address storage for a list key.  Only the 'ipv4' member
    +                    * is written by the code in this file. */
    +    union {
    +        ovs_16aligned_be32 ipv4;
    +        union ovs_16aligned_in6_addr ipv6;
    +        ovs_be32 ipv4_aligned;
    +        struct in6_addr ipv6_aligned;
    +    };
    +};
    +
    +struct ipf_frag {  /* One stored fragment of a datagram. */
    +    struct dp_packet *pkt;     /* The fragment packet itself. */
    +    uint16_t start_data_byte;  /* Fragment offset in bytes. */
    +    uint16_t end_data_byte;    /* Offset of the last payload byte
    +                                * (inclusive). */
    +};
    +
    +struct ipf_list_key {  /* Identifies the datagram a fragment belongs to.
    +                        * ipf_key_extract() zeroes the whole structure
    +                        * before filling it, and ipf_list_key_hash() hashes
    +                        * every byte after 'dst_addr', padding included. */
    +    struct ipf_addr src_addr;
    +    struct ipf_addr dst_addr;
    +    uint32_t recirc_id;
    +    ovs_be16 dl_type;
    +    ovs_be16 ip_id;   /* V6 is 32 bits. */
    +    uint16_t zone;
    +    uint8_t nw_proto;
    +};
    +
    +struct ipf_list {  /* All fragments collected so far for one datagram. */
    +    struct hmap_node node;          /* In 'frag_lists'. */
    +    struct ovs_list exp_node;       /* In 'frag_exp_list'. */
    +    struct ovs_list complete_node;  /* In 'frag_complete_list'. */
    +    struct ipf_frag *frag_list;     /* Array with 'size' entries. */
    +    struct ipf_list_key key;
    +    struct dp_packet *reass_execute_ctx;  /* Reassembled packet once it has
    +                                           * been put into a batch. */
    +    long long expiration;  /* Set to 'now' plus the list timeout. */
    +    int last_sent_idx;     /* Last 'frag_list' entry handed back to a batch,
    +                            * or IPF_INVALID_IDX. */
    +    int last_inuse_idx;    /* Last filled 'frag_list' entry, or
    +                            * IPF_INVALID_IDX. */
    +    int size;              /* Allocated entries in 'frag_list'. */
    +    uint8_t state;         /* An enum ipf_list_state value. */
    +};
    +
    +struct reassembled_pkt {  /* A reassembled packet that still has to pass
    +                           * through conntrack. */
    +    struct ovs_list rp_list_node;  /* In 'reassembled_pkt_list'. */
    +    struct dp_packet *pkt;         /* The reassembled packet. */
    +    struct ipf_list *list;         /* Fragments it was built from. */
    +};
    +
    +struct OVS_LOCKABLE ipf_lock {  /* Mutex wrapper marked lockable for thread
    +                                 * safety annotations. */
    +    struct ovs_mutex lock;
    +};
    +
    +static int max_frag_list_size;  /* Cap on the entries of one list's
    +                                 * 'frag_list'; computed in ipf_init(). */
    +
    +static struct hmap frag_lists OVS_GUARDED;  /* struct ipf_list, by key. */
    +static struct ovs_list frag_exp_list OVS_GUARDED;  /* Lists awaiting
    +                                                    * completion or expiry. */
    +static struct ovs_list frag_complete_list OVS_GUARDED;  /* Lists whose
    +                                                         * fragments can be
    +                                                         * sent. */
    +static struct ovs_list reassembled_pkt_list OVS_GUARDED;  /* struct
    +                                                           * reassembled_pkt. */
    +
    +static atomic_bool ifp_enabled;  /* Whether fragment handling is on (the
    +                                  * name is spelled 'ifp', not 'ipf'). */
    +static atomic_uint nfrag_max;    /* Limit on fragments held at once. */
    +/* Will be clamped above 400 bytes; the value chosen should handle
    + * alg control packets of interest that use text encoding of mutable
    + * IP fields; meaning they should not be fragmented.  Fragments other than
    + * the last one that are not larger than this are rejected by
    + * ipf_key_extract(). */
    +static atomic_uint min_frag_size;
    +
    +static atomic_count nfrag;           /* Fragments currently held. */
    +static atomic_count nfrag_accepted;  /* Fragments ever stored. */
    +static atomic_count nfrag_completed_sent;  /* Sent from completed lists. */
    +static atomic_count nfrag_expired_sent;    /* Sent from expired lists. */
    +static atomic_count nfrag_too_small;  /* Rejected as too small. */
    +static atomic_count n_overlap_frag;   /* Rejected as overlapping. */
    +
    +static struct ipf_lock ipf_lock;  /* Taken around all list and map access. */
    +
    +static void ipf_lock_init(struct ipf_lock *lock)  /* Initializes the mutex in
    +                                                   * 'lock' as adaptive. */
    +{
    +    ovs_mutex_init_adaptive(&lock->lock);
    +}
    +
    +static void ipf_lock_lock(struct ipf_lock *lock)  /* Acquires 'lock'. */
    +    OVS_ACQUIRES(lock)
    +    OVS_NO_THREAD_SAFETY_ANALYSIS
    +{
    +    ovs_mutex_lock(&lock->lock);
    +}
    +
    +static void ipf_lock_unlock(struct ipf_lock *lock)  /* Releases 'lock'. */
    +    OVS_RELEASES(lock)
    +    OVS_NO_THREAD_SAFETY_ANALYSIS
    +{
    +    ovs_mutex_unlock(&lock->lock);
    +}
    +
    +static void ipf_lock_destroy(struct ipf_lock *lock)  /* Destroys the mutex in
    +                                                      * 'lock'. */
    +{
    +    ovs_mutex_destroy(&lock->lock);
    +}
    +
    +/* Returns whether fragment handling is currently switched on, using a
    + * relaxed atomic read of 'ifp_enabled'. */
    +static bool
    +ipf_get_enabled(void)
    +{
    +    bool enabled;
    +
    +    atomic_read_relaxed(&ifp_enabled, &enabled);
    +    return enabled;
    +}
    +
    +/* Folds all bytes of 'addr' into 'hash' and returns the updated hash. */
    +static uint32_t
    +ipf_addr_hash_add(uint32_t hash, const struct ipf_addr *addr)
    +{
    +    /* The address is hashed as 32-bit words, so its size has to be a
    +     * multiple of four bytes. */
    +    BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
    +    const uint32_t *words = (const uint32_t *) addr;
    +
    +    return hash_add_bytes32(hash, words, sizeof *addr);
    +}
    +
    +/* The list access functions are called with the ipf_lock held.
    + *
    + * ipf_expiry_list_add() stamps 'ipf_list' with an expiration time of 'now'
    + * plus IPF_FRAG_LIST_TIMEOUT_DEFAULT and appends it to 'frag_exp_list'.
    + * The remaining helpers append to the tail of, or unlink from, the list
    + * named in the function: 'frag_exp_list', 'frag_complete_list' or
    + * 'reassembled_pkt_list'. */
    +static void
    +ipf_expiry_list_add(struct ipf_list *ipf_list, long long now)
    +{
    +    enum {
    +        IPF_FRAG_LIST_TIMEOUT_DEFAULT = 15000,  /* Same unit as 'now';
    +                                                 * presumably milliseconds,
    +                                                 * confirm with callers. */
    +    };
    +
    +    ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT_DEFAULT;
    +    ovs_list_push_back(&frag_exp_list, &ipf_list->exp_node);
    +}
    +
    +static void
    +ipf_completed_list_add(struct ipf_list *ipf_list)
    +{
    +    ovs_list_push_back(&frag_complete_list, &ipf_list->complete_node);
    +}
    +
    +static void
    +ipf_reassembled_list_add(struct reassembled_pkt *rp)
    +{
    +    ovs_list_push_back(&reassembled_pkt_list, &rp->rp_list_node);
    +}
    +
    +static void
    +ipf_expiry_list_remove(struct ipf_list *ipf_list)
    +{
    +    ovs_list_remove(&ipf_list->exp_node);
    +}
    +
    +static void
    +ipf_completed_list_remove(struct ipf_list *ipf_list)
    +{
    +    ovs_list_remove(&ipf_list->complete_node);
    +}
    +
    +static void
    +ipf_reassembled_list_remove(struct reassembled_pkt *rp)
    +{
    +    ovs_list_remove(&rp->rp_list_node);
    +}
    +
    +/* Hashes 'key' starting from 'basis'.
    + *
    + * The source and destination addresses are hashed separately and combined
    + * with XOR, so swapping them yields the same value (symmetric).  Every
    + * word of the key that follows 'dst_addr' is then mixed in. */
    +static uint32_t
    +ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
    +{
    +    uint32_t src_hash = ipf_addr_hash_add(basis, &key->src_addr);
    +    uint32_t dst_hash = ipf_addr_hash_add(basis, &key->dst_addr);
    +
    +    /* Hash the rest of the key: from just past 'dst_addr' to its end. */
    +    uint32_t *rest = (uint32_t *) (&key->dst_addr + 1);
    +    uint32_t *key_end = (uint32_t *) (key + 1);
    +    uint32_t hash = hash_words(rest, key_end - rest, src_hash ^ dst_hash);
    +
    +    return hash_finish(hash, 0);
    +}
    +
    +/* Returns true if 'pkt' is the first fragment of a datagram: its fragment
    + * offset is zero and the "more fragments" flag is set. */
    +static bool
    +ipf_is_first_fragment(const struct dp_packet *pkt)
    +{
    +    const struct ip_header *l3 = dp_packet_l3(pkt);
    +    ovs_be16 frag_off = l3->ip_frag_off;
    +
    +    return !(frag_off & htons(IP_FRAG_OFF_MASK))
    +           && (frag_off & htons(IP_MORE_FRAGMENTS));
    +}
    +
    +/* Returns true if 'pkt' is the last fragment of a datagram: its fragment
    + * offset is nonzero and the "more fragments" flag is clear. */
    +static bool
    +ipf_is_last_fragment(const struct dp_packet *pkt)
    +{
    +    const struct ip_header *l3 = dp_packet_l3(pkt);
    +    ovs_be16 frag_off = l3->ip_frag_off;
    +
    +    return (frag_off & htons(IP_FRAG_OFF_MASK))
    +           && !(frag_off & htons(IP_MORE_FRAGMENTS));
    +}
    +
    +/* Returns true if every stored fragment of 'ipf_list' starts exactly one
    + * byte after the previous one ends, i.e. the array has no gaps.  The
    + * result is only meaningful when the array is sorted by offset.
    + *
    + * This function is called with the ipf_lock held. */
    +static bool
    +ipf_list_complete(const struct ipf_list *ipf_list)
    +{
    +    const struct ipf_frag *frags = ipf_list->frag_list;
    +    int last = ipf_list->last_inuse_idx;
    +
    +    for (int i = 1; i <= last; i++) {
    +        if (frags[i - 1].end_data_byte + 1 != frags[i].start_data_byte) {
    +            return false;
    +        }
    +    }
    +    return true;
    +}
    +
    +/* Insertion sort of 'frag_list[0..last_idx]' by ascending start offset.
    + * Runs O(n) for a sorted or almost sorted list.
    + *
    + * This function is called with the ipf_lock held. */
    +static void
    +ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
    +{
    +    for (int unsorted = 1; unsorted <= last_idx; unsorted++) {
    +        struct ipf_frag moving = frag_list[unsorted];
    +        int slot = unsorted;
    +
    +        /* Shift larger entries one place up until 'moving' fits. */
    +        while (slot > 0 &&
    +               frag_list[slot - 1].start_data_byte >
    +                   moving.start_data_byte) {
    +            frag_list[slot] = frag_list[slot - 1];
    +            slot--;
    +        }
    +        frag_list[slot] = moving;
    +    }
    +}
    +
    +/* Builds one packet out of the fragments of 'ipf_list'.
    + *
    + * Called on a sorted complete list of fragments.  The first fragment is
    + * cloned and the payload of each following fragment is appended to it;
    + * the clone's IP total length, "more fragments" flag and header checksum
    + * are then updated.  Returns the new packet, or NULL if the total length
    + * would exceed IPV4_PACKET_MAX_SIZE.
    + *
    + * This function is called with the ipf_lock held. */
    +static struct dp_packet *
    +ipf_reassemble_frags(struct ipf_list *ipf_list)
    +{
    +    struct ipf_frag *frags = ipf_list->frag_list;
    +    struct dp_packet *reass = dp_packet_clone(frags[0].pkt);
    +    struct ip_header *l3 = dp_packet_l3(reass);
    +    int total_len = ntohs(l3->ip_tot_len);
    +
    +    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
    +        int payload_len = frags[i].end_data_byte
    +                          - frags[i].start_data_byte + 1;
    +
    +        total_len += payload_len;
    +        if (total_len > IPV4_PACKET_MAX_SIZE) {
    +            dp_packet_delete(reass);
    +            return NULL;
    +        }
    +
    +        /* Append only the bytes that follow this fragment's IP header. */
    +        l3 = dp_packet_l3(frags[i].pkt);
    +        size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
    +        size_t add_len = payload_len;
    +        dp_packet_put(reass, (char *)l3 + ip_hdr_len, add_len);
    +    }
    +
    +    /* Appending may have moved the data, so fetch the header again. */
    +    l3 = dp_packet_l3(reass);
    +    ovs_be16 cleared_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
    +    l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
    +                                cleared_frag_off);
    +    l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len,
    +                                htons(total_len));
    +    l3->ip_tot_len = htons(total_len);
    +    l3->ip_frag_off = cleared_frag_off;
    +
    +    return reass;
    +}
    +
    +/* Advances the reassembly state of 'ipf_list' after the valid fragment
    + * 'pkt' has been appended to its fragment array.
    + *
    + * Whenever the resulting state is FIRST_LAST_SEEN the array is sorted by
    + * offset and checked for gaps.  The sort has to run on the transition into
    + * that state as well as while already in it: otherwise a datagram whose
    + * fragments arrived out of order, and whose first or last fragment arrived
    + * last, would be checked while unsorted and never be found complete.
    + *
    + * When the array is complete the state becomes COMPLETED and reassembly is
    + * attempted.  On success the reassembled packet is queued on
    + * 'reassembled_pkt_list' and the list leaves the expiry list; on failure
    + * the list stays on the expiry list.
    + *
    + * This function is called with the ipf_lock held. */
    +static void
    +ipf_list_state_transition(struct ipf_list *ipf_list,
    +                          const struct dp_packet *pkt)
    +{
    +    enum ipf_list_state curr_state = ipf_list->state;
    +    enum ipf_list_state next_state;
    +
    +    switch (curr_state) {
    +    case IPF_LIST_STATE_UNUSED:
    +    case IPF_LIST_STATE_OTHER_SEEN:
    +        if (ipf_is_first_fragment(pkt)) {
    +            next_state = IPF_LIST_STATE_FIRST_SEEN;
    +        } else if (ipf_is_last_fragment(pkt)) {
    +            next_state = IPF_LIST_STATE_LAST_SEEN;
    +        } else {
    +            next_state = IPF_LIST_STATE_OTHER_SEEN;
    +        }
    +        break;
    +    case IPF_LIST_STATE_FIRST_SEEN:
    +        next_state = ipf_is_last_fragment(pkt)
    +                     ? IPF_LIST_STATE_FIRST_LAST_SEEN
    +                     : IPF_LIST_STATE_FIRST_SEEN;
    +        break;
    +    case IPF_LIST_STATE_LAST_SEEN:
    +        next_state = ipf_is_first_fragment(pkt)
    +                     ? IPF_LIST_STATE_FIRST_LAST_SEEN
    +                     : IPF_LIST_STATE_LAST_SEEN;
    +        break;
    +    case IPF_LIST_STATE_FIRST_LAST_SEEN:
    +    case IPF_LIST_STATE_COMPLETED:
    +        next_state = curr_state;
    +        break;
    +    default:
    +        OVS_NOT_REACHED();
    +    }
    +
    +    if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
    +        /* ipf_list_complete() only compares neighbours, so sort first. */
    +        ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
    +
    +        if (ipf_list_complete(ipf_list)) {
    +            next_state = IPF_LIST_STATE_COMPLETED;
    +            struct dp_packet *reass_pkt = ipf_reassemble_frags(ipf_list);
    +            if (reass_pkt) {
    +                struct reassembled_pkt *rp = xzalloc(sizeof *rp);
    +                rp->pkt = reass_pkt;
    +                rp->list = ipf_list;
    +                ipf_reassembled_list_add(rp);
    +                ipf_expiry_list_remove(ipf_list);
    +            }
    +        }
    +    }
    +    ipf_list->state = next_state;
    +}
    +
    +/* Validates 'pkt' as an IPv4 fragment and, if it passes, fills in 'key'.
    + *
    + * Returns false, leaving 'key' untouched, when the packet is flagged with
    + * a bad IP checksum, lacks L2 or L3 headers, has inconsistent lengths, is
    + * not a fragment, fails the IP header checksum, or is a fragment other
    + * than the last one that is not larger than 'min_frag_size' (which also
    + * bumps 'nfrag_too_small').  Otherwise zeroes 'key', fills it from the
    + * packet, 'dl_type' and 'zone', and returns true. */
    +static bool
    +ipf_key_extract(const struct dp_packet *pkt, ovs_be16 dl_type,
    +                uint16_t zone, struct ipf_list_key *key)
    +{
    +    if (dp_packet_ip_checksum_bad(pkt)) {
    +        return false;
    +    }
    +
    +    const struct eth_header *l2 = dp_packet_eth(pkt);
    +    const struct ip_header *l3 = dp_packet_l3(pkt);
    +    if (!l2 || !l3) {
    +        return false;
    +    }
    +
    +    /* L3 length actually present, with any L2 padding excluded. */
    +    const char *tail = dp_packet_tail(pkt);
    +    uint8_t pad = dp_packet_l2_pad_size(pkt);
    +    size_t size = tail - (char *)l3 -pad;
    +
    +    if (OVS_UNLIKELY(size < IP_HEADER_LEN)
    +        || ntohs(l3->ip_tot_len) != size
    +        || !(IP_IS_FRAGMENT(l3->ip_frag_off))) {
    +        return false;
    +    }
    +
    +    size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
    +    if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)
    +        || OVS_UNLIKELY(size < ip_hdr_len)) {
    +        return false;
    +    }
    +
    +    /* Verify the header checksum unless it is already marked valid. */
    +    if (!dp_packet_ip_checksum_valid(pkt) && csum(l3, ip_hdr_len) != 0) {
    +        return false;
    +    }
    +
    +    uint32_t min_size;
    +    atomic_read_relaxed(&min_frag_size, &min_size);
    +    if (!ipf_is_last_fragment(pkt) && dp_packet_size(pkt) <= min_size) {
    +        atomic_count_inc(&nfrag_too_small);
    +        return false;
    +    }
    +
    +    /* Zero first so that padding hashes and compares consistently. */
    +    memset(key, 0, sizeof *key);
    +    key->src_addr.ipv4 = l3->ip_src;
    +    key->dst_addr.ipv4 = l3->ip_dst;
    +    key->recirc_id = pkt->md.recirc_id;
    +    key->dl_type = dl_type;
    +    key->ip_id = l3->ip_id;
    +    key->zone = zone;
    +    key->nw_proto = l3->ip_proto;
    +    return true;
    +}
    +
    +/* Compares two list keys field by field.  Returns 0 if they are equal and
    + * 1 otherwise; there is no ordering.
    + *
    + * This function is called with the ipf_lock held. */
    +static int
    +ipf_list_key_cmp(const struct ipf_list_key *key1,
    +                 const struct ipf_list_key *key2)
    +{
    +    bool same_addrs =
    +        !memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr)
    +        && !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr);
    +    bool same_rest = key1->dl_type == key2->dl_type
    +                     && key1->ip_id == key2->ip_id
    +                     && key1->zone == key2->zone
    +                     && key1->nw_proto == key2->nw_proto
    +                     && key1->recirc_id == key2->recirc_id;
    +
    +    return same_addrs && same_rest ? 0 : 1;
    +}
    +
    +/* Searches 'frag_lists' for the fragment list whose key equals 'key'.
    + * 'hash' must be the hash of 'key'.  Returns the list, or NULL if there is
    + * none.
    + *
    + * This function is called with the ipf_lock held. */
    +static struct ipf_list *
    +ipf_list_key_lookup(struct hmap *frag_lists,
    +                    const struct ipf_list_key *key,
    +                    uint32_t hash)
    +{
    +    struct ipf_list *candidate;
    +
    +    HMAP_FOR_EACH_WITH_HASH (candidate, node, hash, frag_lists) {
    +        if (ipf_list_key_cmp(&candidate->key, key) == 0) {
    +            return candidate;
    +        }
    +    }
    +    return NULL;
    +}
    +
    +/* Returns true if the byte range ['start_data_byte', 'end_data_byte']
    + * (both inclusive) overlaps any fragment stored in
    + * 'frag_list[0..last_inuse_idx]'.
    + *
    + * Three cases count as overlap: the new range starts inside a stored
    + * fragment, ends inside one, or strictly contains one.  The last case
    + * must be tested explicitly because neither endpoint of the new range
    + * falls inside the stored fragment.
    + *
    + * This function is called with the ipf_lock held. */
    +static bool
    +ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
    +                  size_t start_data_byte, size_t end_data_byte)
    +{
    +    for (int i = 0; i <= last_inuse_idx; i++) {
    +        size_t frag_start = frag_list[i].start_data_byte;
    +        size_t frag_end = frag_list[i].end_data_byte;
    +
    +        bool starts_inside = start_data_byte >= frag_start
    +                             && start_data_byte <= frag_end;
    +        bool ends_inside = end_data_byte >= frag_start
    +                           && end_data_byte <= frag_end;
    +        bool contains = start_data_byte < frag_start
    +                        && end_data_byte > frag_end;
    +
    +        if (starts_inside || ends_inside || contains) {
    +            return true;
    +        }
    +    }
    +    return false;
    +}
    +
    +/* Stores fragment 'pkt' in 'ipf_list' unless it overlaps a fragment that
    + * is already there.
    + *
    + * On overlap, bumps 'n_overlap_frag', marks the packet CS_INVALID and
    + * returns false.  Otherwise records the packet with its inclusive payload
    + * byte range, updates the counters, runs the list state machine and
    + * returns true.  The caller must have made room in the fragment array.
    + *
    + * NOTE(review): 'end_data_byte' is computed in 16 bits; a large offset
    + * plus a large payload looks like it could wrap -- confirm.
    + *
    + * This function is called with the ipf_lock held. */
    +static bool
    +ipf_process_frag(struct ipf_list *ipf_list, struct dp_packet *pkt)
    +{
    +    const struct ip_header *l3 = dp_packet_l3(pkt);
    +    uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
    +    uint16_t ip_off = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
    +    size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
    +    uint16_t end_data_byte = ip_off + ip_tot_len - ip_hdr_len - 1;
    +
    +    if (ipf_is_frag_duped(ipf_list->frag_list, ipf_list->last_inuse_idx,
    +                          ip_off, end_data_byte)) {
    +        atomic_count_inc(&n_overlap_frag);
    +        pkt->md.ct_state = CS_INVALID;
    +        return false;
    +    }
    +
    +    int slot = ipf_list->last_inuse_idx + 1;
    +    if (slot >= ipf_list->size) {
    +        OVS_NOT_REACHED();
    +    }
    +
    +    /* In the case of dpdk, it would be unfortunate if we had
    +     * to create a clone fragment outside the dpdk mp due to the
    +     * mempool size being too limited. We will otherwise need to
    +     * recommend not setting the mempool number of buffers too low
    +     * and also clamp the number of fragments. */
    +    struct ipf_frag *frag = &ipf_list->frag_list[slot];
    +    frag->pkt = pkt;
    +    frag->start_data_byte = ip_off;
    +    frag->end_data_byte = end_data_byte;
    +    ipf_list->last_inuse_idx = slot;
    +
    +    atomic_count_inc(&nfrag);
    +    atomic_count_inc(&nfrag_accepted);
    +    ipf_list_state_transition(ipf_list, pkt);
    +    return true;
    +}
    +
    +/* This function is called with the ipf_lock held. */
    +static bool
    +ipf_handle_frag(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
    +                long long now, uint32_t hash_basis)
    +{
    +    struct ipf_list_key key;
    +    bool rc = ipf_key_extract(pkt, dl_type, zone, &key);
    +    if (!rc) {
    +        return false;
    +    }
    +
    +    unsigned int nfrag_max_;
    +    atomic_read_relaxed(&nfrag_max, &nfrag_max_);
    +    if (atomic_count_get(&nfrag) >= nfrag_max_) {
    +        return false;
    +    }
    +
    +    uint32_t hash = ipf_list_key_hash(&key, hash_basis);
    +    struct ipf_list *ipf_list =
    +        ipf_list_key_lookup(&frag_lists, &key, hash);
    +    enum {
    +        IPF_FRAG_LIST_MIN_INCREMENT = 4,
    +    };
    +
    +    if (!ipf_list) {
    +        ipf_list = xzalloc(sizeof *ipf_list);
    +        ipf_list->key = key;
    +        ipf_list->last_inuse_idx = IPF_INVALID_IDX;
    +        ipf_list->last_sent_idx = IPF_INVALID_IDX;
    +        ipf_list->size =
    +            MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT);
    +        ipf_list->frag_list =
    +            xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
    +        hmap_insert(&frag_lists, &ipf_list->node, hash);
    +        ipf_expiry_list_add(ipf_list, now);
    +    } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
    +        int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
    +                            max_frag_list_size - ipf_list->size);
    +        /* Enforce limit. */
    +        if (increment) {

s/if (increment) {/if (increment > 0) {/

    +            ipf_list->frag_list =
    +                xrealloc(ipf_list->frag_list, (ipf_list->size + increment) 
*
    +                  sizeof *ipf_list->frag_list);
    +            ipf_list->size += increment;
    +        } else {
    +            return false;
    +        }
    +    }
    +
    +    rc = ipf_process_frag(ipf_list, pkt);
    +
    +    return rc;
    +}
    +
    +/* Pulls the fragments that ipf_handle_frag() accepts out of 'pb'; every
    + * other packet stays in the batch.  Handles V4 fragments right now: for
    + * any other 'dl_type' the batch is left untouched.  The ipf_lock is taken
    + * and released once per packet. */
    +static void
    +ipf_extract_frags_from_batch(struct dp_packet_batch *pb, ovs_be16 dl_type,
    +                             uint16_t zone, long long now,
    +                             uint32_t hash_basis)
    +{
    +    if (dl_type != htons(ETH_TYPE_IP)) {
    +        return;
    +    }
    +
    +    const size_t pb_cnt = dp_packet_batch_size(pb);
    +    int pb_idx; /* Index in a packet batch. */
    +    struct dp_packet *pkt;
    +
    +    DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
    +        ipf_lock_lock(&ipf_lock);
    +        bool taken = ipf_handle_frag(pkt, dl_type, zone, now, hash_basis);
    +        if (!taken) {
    +            /* Not ours: put the packet back into the batch. */
    +            dp_packet_batch_refill(pb, pkt, pb_idx);
    +        }
    +        ipf_lock_unlock(&ipf_lock);
    +    }
    +}
    +
    +/* Appends 'pkt' to 'pb' if the batch holds fewer than NETDEV_MAX_BURST
    + * packets.  Returns true if the packet was added, false if the batch was
    + * already full. */
    +static bool
    +ipf_dp_packet_batch_add(struct dp_packet_batch *pb , struct dp_packet *pkt)
    +{
    +    if (pb->count >= NETDEV_MAX_BURST) {
    +        return false;
    +    }
    +
    +    dp_packet_batch_add(pb, pkt);
    +    return true;
    +}
    +
    +/* Moves the not yet sent fragments of 'ipf_list' into 'pb', in array
    + * order, until the batch is full.  Each fragment sent decrements 'nfrag'
    + * and bumps the sent counter selected by 'list_type'.
    + *
    + * Returns true once the last stored fragment has been sent.  Returns
    + * false if the batch filled up first, or if nothing was left to send on
    + * entry.
    + *
    + * This function is called with the ipf_lock held. */
    +static bool
    +ipf_send_frags_in_list(struct ipf_list *ipf_list, struct dp_packet_batch *pb,
    +                       enum ipf_list_type list_type)
    +{
    +    while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
    +        struct dp_packet *frag_pkt =
    +            ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
    +
    +        if (!ipf_dp_packet_batch_add(pb, frag_pkt)) {
    +            return false;
    +        }
    +
    +        ipf_list->last_sent_idx++;
    +        atomic_count_dec(&nfrag);
    +
    +        if (list_type == IPF_FRAG_COMPLETED_LIST) {
    +            atomic_count_inc(&nfrag_completed_sent);
    +        } else {
    +            atomic_count_inc(&nfrag_expired_sent);
    +        }
    +
    +        if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
    +            return true;
    +        }
    +    }
    +    return false;
    +}
    +
    +/* Unlinks 'ipf_list' from the global list selected by 'list_type' and from
    + * the 'frag_lists' map, then frees its fragment array and the list
    + * itself.  The fragment packets are not freed here.
    + *
    + * This function is called with the ipf_lock held. */
    +static void
    +ipf_list_remove(struct ipf_list *ipf_list, enum ipf_list_type list_type)
    +{
    +    switch (list_type) {
    +    case IPF_FRAG_COMPLETED_LIST:
    +        ipf_completed_list_remove(ipf_list);
    +        break;
    +    case IPF_FRAG_EXPIRY_LIST:
    +    default:
    +        ipf_expiry_list_remove(ipf_list);
    +        break;
    +    }
    +
    +    hmap_remove(&frag_lists, &ipf_list->node);
    +    free(ipf_list->frag_list);
    +    free(ipf_list);
    +}
    +
    +/* Sends the fragments of completed lists into 'pb', oldest list first.
    + * A list is removed once all of its fragments have been sent; processing
    + * stops at the first list that could not be fully sent. */
    +static void
    +ipf_send_completed_frags(struct dp_packet_batch *pb)
    +{
    +    struct ipf_list *ipf_list, *next;
    +
    +    ipf_lock_lock(&ipf_lock);
    +    if (!ovs_list_is_empty(&frag_complete_list)) {
    +        LIST_FOR_EACH_SAFE (ipf_list, next, complete_node,
    +                            &frag_complete_list) {
    +            if (!ipf_send_frags_in_list(ipf_list, pb,
    +                                        IPF_FRAG_COMPLETED_LIST)) {
    +                break;
    +            }
    +            ipf_list_remove(ipf_list, IPF_FRAG_COMPLETED_LIST);
    +        }
    +    }
    +    ipf_lock_unlock(&ipf_lock);
    +}
    +
    +/* Sends the fragments of expired lists into 'pb' and removes those lists.
    + *
    + * The expiry list is walked from its head and the walk stops at the first
    + * list that has not expired at 'now', at the first list that could not be
    + * fully sent, or once IPF_FRAG_LIST_MAX_EXPIRED lists were removed. */
    +static void
    +ipf_send_expired_frags(struct dp_packet_batch *pb, long long now)
    +{
    +    enum {
    +        /* Very conservative, due to DOS probability. */
    +        IPF_FRAG_LIST_MAX_EXPIRED = 1,
    +    };
    +
    +    struct ipf_list *ipf_list, *next;
    +    size_t lists_removed = 0;
    +
    +    ipf_lock_lock(&ipf_lock);
    +    if (!ovs_list_is_empty(&frag_exp_list)) {
    +        LIST_FOR_EACH_SAFE (ipf_list, next, exp_node, &frag_exp_list) {
    +            bool expired = now > ipf_list->expiration;
    +
    +            if (!expired || lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
    +                break;
    +            }
    +            if (!ipf_send_frags_in_list(ipf_list, pb,
    +                                        IPF_FRAG_EXPIRY_LIST)) {
    +                break;
    +            }
    +            ipf_list_remove(ipf_list, IPF_FRAG_EXPIRY_LIST);
    +            lists_removed++;
    +        }
    +    }
    +    ipf_lock_unlock(&ipf_lock);
    +}
    +
    +/* Adds reassembled packets that have not been put into a batch yet to
    + * 'pb', as far as the batch has room, and records each one in its list's
    + * 'reass_execute_ctx' so it can be recognised later. */
    +static void
    +ipf_execute_reass_pkts(struct dp_packet_batch *pb)
    +{
    +    struct reassembled_pkt *rp, *next;
    +
    +    ipf_lock_lock(&ipf_lock);
    +    if (!ovs_list_is_empty(&reassembled_pkt_list)) {
    +        LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &reassembled_pkt_list) {
    +            if (rp->list->reass_execute_ctx) {
    +                /* Already handed to a batch earlier. */
    +                continue;
    +            }
    +            if (ipf_dp_packet_batch_add(pb, rp->pkt)) {
    +                rp->list->reass_execute_ctx = rp->pkt;
    +            }
    +        }
    +    }
    +    ipf_lock_unlock(&ipf_lock);
    +}
    +
    +/* Finishes processing of the reassembled packets found in 'pb'.
    + *
    + * For each entry on 'reassembled_pkt_list' whose reassembled packet is in
    + * 'pb', this function:
    + *   - copies the packet's conntrack metadata to every fragment;
    + *   - copies as many L4 bytes as the first fragment holds, and the source
    + *     and destination addresses (adjusting the IP checksum), from the
    + *     reassembled packet into the first fragment;
    + *   - moves the fragment list to 'frag_complete_list';
    + *   - deletes the reassembled packet and drops it from 'pb'.
    + * All other packets stay in 'pb'.
    + *
    + * The fields needed from 'rp' are read before the batch loop and a flag
    + * records that 'rp' was consumed, so the freed entry is never
    + * dereferenced while the rest of the batch is scanned. */
    +static void
    +ipf_post_execute_reass_pkts(struct dp_packet_batch *pb)
    +{
    +    struct reassembled_pkt *rp, *next;
    +    ipf_lock_lock(&ipf_lock);
    +
    +    if (ovs_list_is_empty(&reassembled_pkt_list)) {
    +        ipf_lock_unlock(&ipf_lock);
    +        return;
    +    }
    +
    +    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &reassembled_pkt_list) {
    +        const size_t pb_cnt = dp_packet_batch_size(pb);
    +        int pb_idx;
    +        struct dp_packet *pkt;
    +        struct ipf_list *list = rp->list;
    +        const struct dp_packet *reass_ctx = list->reass_execute_ctx;
    +        bool rp_consumed = false;
    +
    +        /* Inner batch loop is constant time since batch size is <=
    +         * NETDEV_MAX_BURST. */
    +        DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
    +            if (!rp_consumed && pkt == reass_ctx) {
    +
    +                for (int i = 0; i <= list->last_inuse_idx; i++) {
    +                    struct dp_packet *frag = list->frag_list[i].pkt;
    +
    +                    frag->md.ct_label = pkt->md.ct_label;
    +                    frag->md.ct_mark = pkt->md.ct_mark;
    +                    frag->md.ct_orig_tuple_ipv6 =
    +                        pkt->md.ct_orig_tuple_ipv6;
    +                    frag->md.ct_state = pkt->md.ct_state;
    +                    frag->md.ct_zone = pkt->md.ct_zone;
    +                }
    +
    +                struct dp_packet *first = list->frag_list[0].pkt;
    +                const char *tail_frag = dp_packet_tail(first);
    +                uint8_t pad_frag = dp_packet_l2_pad_size(first);
    +                struct ip_header *l3_frag = dp_packet_l3(first);
    +                struct ip_header *l3_reass = dp_packet_l3(pkt);
    +                void *l4_frag = dp_packet_l4(first);
    +                void *l4_reass = dp_packet_l4(pkt);
    +
    +                /* Copy only as much L4 data as the first fragment holds. */
    +                memcpy(l4_frag, l4_reass,
    +                       tail_frag - (char *) l4_frag - pad_frag);
    +
    +                /* The address fields are only 16-bit aligned, hence the
    +                 * copies through aligned temporaries. */
    +                ovs_be32 reass_ip;
    +                ovs_be32 frag_ip;
    +                memcpy(&reass_ip, &l3_reass->ip_src, sizeof reass_ip);
    +                memcpy(&frag_ip, &l3_frag->ip_src, sizeof frag_ip);
    +                l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
    +                                                 frag_ip, reass_ip);
    +                memcpy(&l3_frag->ip_src, &l3_reass->ip_src,
    +                       sizeof l3_frag->ip_src);
    +                memcpy(&reass_ip, &l3_reass->ip_dst, sizeof reass_ip);
    +                memcpy(&frag_ip, &l3_frag->ip_dst, sizeof frag_ip);
    +                l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
    +                                                 frag_ip, reass_ip);
    +                memcpy(&l3_frag->ip_dst, &l3_reass->ip_dst,
    +                       sizeof l3_frag->ip_dst);
    +
    +                ipf_completed_list_add(list);
    +                ipf_reassembled_list_remove(rp);
    +                dp_packet_delete(rp->pkt);
    +                free(rp);
    +                /* 'rp' is gone; keep the remaining packets in the batch
    +                 * without looking at it again. */
    +                rp_consumed = true;
    +            } else {
    +                dp_packet_batch_refill(pb, pkt, pb_idx);
    +            }
    +        }
    +    }
    +    ipf_lock_unlock(&ipf_lock);
    +}
    +
    +/* Conntrack pre-processing hook.  Does nothing unless fragment handling
    + * is enabled; otherwise extracts fragments from 'pb' and then adds any
    + * pending reassembled packets to 'pb'. */
    +void
    +ipf_preprocess_conntrack(struct dp_packet_batch *pb, long long now,
    +                         ovs_be16 dl_type, uint16_t zone,
    +                         uint32_t hash_basis)
    +{
    +    if (!ipf_get_enabled()) {
    +        return;
    +    }
    +
    +    ipf_extract_frags_from_batch(pb, dl_type, zone, now, hash_basis);
    +    ipf_execute_reass_pkts(pb);
    +}
    +
    +/* Conntrack post-processing hook.  Does nothing unless fragment handling
    + * is enabled; otherwise finishes the reassembled packets found in 'pb',
    + * then sends fragments of completed lists and of lists expired at 'now'
    + * into 'pb'. */
    +void
    +ipf_postprocess_conntrack(struct dp_packet_batch *pb, long long now)
    +{
    +    if (!ipf_get_enabled()) {
    +        return;
    +    }
    +
    +    ipf_post_execute_reass_pkts(pb);
    +    ipf_send_completed_frags(pb);
    +    ipf_send_expired_frags(pb, now);
    +}
    +
    +/* Initializes the fragment handling module: the lock, the fragment map
    + * and lists, the tunables and the counters.  Fragment handling starts out
    + * disabled. */
    +void
    +ipf_init(void)
    +{
    +    ipf_lock_init(&ipf_lock);
    +    ipf_lock_lock(&ipf_lock);
    +    hmap_init(&frag_lists);
    +    ovs_list_init(&frag_exp_list);
    +    ovs_list_init(&frag_complete_list);
    +    ovs_list_init(&reassembled_pkt_list);
    +    atomic_init(&min_frag_size, FRAG_SIZE_MIN_DEF);
    +    /* Derive the cap from the constant just stored.  'min_frag_size' is an
    +     * atomic object and must not be used directly in arithmetic. */
    +    max_frag_list_size = DIV_ROUND_UP(IPV4_PACKET_MAX_SIZE,
    +                                      FRAG_SIZE_MIN_DEF);
    +    ipf_lock_unlock(&ipf_lock);
    +    atomic_count_init(&nfrag, 0);
    +    atomic_count_init(&nfrag_accepted, 0);
    +    atomic_count_init(&nfrag_completed_sent, 0);
    +    atomic_count_init(&nfrag_expired_sent, 0);
    +    atomic_count_init(&nfrag_too_small, 0);
    +    atomic_count_init(&n_overlap_frag, 0);
    +    atomic_init(&nfrag_max, MAX_FRAGS_DEFAULT);
    +    atomic_init(&ifp_enabled, false);
    +}
    +
    +/* Tears down the fragment handling module.
    + *
    + * Frees every fragment list together with the fragment packets it still
    + * holds (those not yet sent back to a batch), frees every pending
    + * reassembled packet, and destroys the map and the lock. */
    +void
    +ipf_destroy(void)
    +{
    +    ipf_lock_lock(&ipf_lock);
    +
    +    struct ipf_list *ipf_list;
    +    HMAP_FOR_EACH_POP (ipf_list, node, &frag_lists) {
    +        /* Entries after 'last_sent_idx' are still owned by this module. */
    +        while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
    +            ipf_list->last_sent_idx++;
    +            dp_packet_delete(
    +                ipf_list->frag_list[ipf_list->last_sent_idx].pkt);
    +            atomic_count_dec(&nfrag);
    +        }
    +        free(ipf_list->frag_list);
    +        free(ipf_list);
    +    }
    +
    +    struct reassembled_pkt *rp;
    +    LIST_FOR_EACH_POP (rp, rp_list_node, &reassembled_pkt_list) {
    +        /* The packet came from dp_packet_clone(), so it must be released
    +         * with dp_packet_delete() rather than free(). */
    +        dp_packet_delete(rp->pkt);
    +        free(rp);
    +    }
    +
    +    hmap_destroy(&frag_lists);
    +    ipf_lock_unlock(&ipf_lock);
    +    ipf_lock_destroy(&ipf_lock);
    +}
    diff --git a/lib/ipf.h b/lib/ipf.h
    new file mode 100644
    index 0000000..992fca0
    --- /dev/null
    +++ b/lib/ipf.h
    @@ -0,0 +1,53 @@
    +/*
    + * Copyright (c) 2018 Nicira, Inc.
    + *
    + * Licensed under the Apache License, Version 2.0 (the "License");
    + * you may not use this file except in compliance with the License.
    + * You may obtain a copy of the License at:
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#ifndef IPF_H
    +#define IPF_H 1
    +
    +#include "dp-packet.h"
    +#include "openvswitch/types.h"
    +
    +struct ipf_status {  /* Settings and counters of the fragment module; the
    +                      * members mirror the like-named variables in ipf.c.
    +                      * Nothing in this patch fills it in yet. */
    +   bool ifp_enabled;            /* Fragment handling on or off. */
    +   unsigned int min_frag_size;  /* Minimum accepted fragment size. */
    +   unsigned int nfrag_max;      /* Limit on fragments held at once. */
    +   unsigned int nfrag;          /* Fragments currently held. */
    +   unsigned int nfrag_accepted;        /* Fragments ever stored. */
    +   unsigned int nfrag_completed_sent;  /* Sent from completed lists. */
    +   unsigned int nfrag_expired_sent;    /* Sent from expired lists. */
    +   unsigned int nfrag_too_small;       /* Rejected as too small. */
    +   unsigned int n_overlap_frag;        /* Rejected as overlapping. */
    +};
    +
    +/* Collects and reassembles fragments which are to be sent through
    + * conntrack, if fragment processing is enabled.  Accepted fragments are
    + * taken out of 'pb' and pending reassembled packets are added to it. */
    +void
    +ipf_preprocess_conntrack(struct dp_packet_batch *pb, long long now,
    +                         ovs_be16 dl_type, uint16_t zone, uint32_t hash_basis);
    +
    +/* Updates the state of fragments associated with reassembled packets and
    + * sends out fragments that are either associated with completed
    + * packets or expired, if fragment processing is enabled. */
    +void
    +ipf_postprocess_conntrack(struct dp_packet_batch *pb, long long now);
    +
    +void
    +ipf_init(void);  /* Sets up the module state; handling starts disabled. */
    +
    +void
    +ipf_destroy(void);  /* Frees the module state set up by ipf_init(). */
    +
    +#endif /* ipf.h */
    -- 
    1.9.1
    
    _______________________________________________
    dev mailing list
    [email protected]
    
https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=5yMLWew8DT45uDVolS3h78PpM9sFkVoldfI8Zx39M1I&s=wsr7IRFR52vKT612ZZKUSvvdjyuUOcSV34d3Alckpxo&e=
    

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [patch v3 2/9] Userspace datapath: Add v4 fragmentation handling.

Reply via email to