On 3/8/17, 6:14 PM, "ovs-dev-boun...@openvswitch.org on behalf of Daniele Di Proietto" <ovs-dev-boun...@openvswitch.org on behalf of diproiet...@ovn.org> wrote:
2017-02-16 0:47 GMT-08:00 Darrell Ball <dlu...@gmail.com>: > This patch introduces NAT support for the userspace datapath. > The conntrack module changes are in this patch. > > The per packet scope of lookups for NAT and un_NAT is at > the bucket level rather than global. One hash table is > introduced to support create/delete handling. The create/delete > events may be further optimized, if the need becomes clear. > > Some NAT options with limited utility (persistent, random) are > not supported yet, but will be supported in a later patch. > > Signed-off-by: Darrell Ball <dlu...@gmail.com> Thanks for the patch, I'll keep looking at this, but since you're about to send another version I had one comment below. > --- > lib/conntrack-private.h | 16 +- > lib/conntrack.c | 782 ++++++++++++++++++++++++++++++++++++++++++------ > lib/conntrack.h | 46 +++ > 3 files changed, 751 insertions(+), 93 deletions(-) > > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > index 493865f..a7c2ae4 100644 > --- a/lib/conntrack-private.h > +++ b/lib/conntrack-private.h > @@ -51,14 +51,23 @@ struct conn_key { > uint16_t zone; > }; > > +struct nat_conn_key_node { > + struct hmap_node node; > + struct conn_key key; > + struct conn_key value; > +}; > + > struct conn { > struct conn_key key; > struct conn_key rev_key; > long long expiration; > struct ovs_list exp_node; > struct hmap_node node; > - uint32_t mark; > ovs_u128 label; > + /* XXX: consider flattening. 
*/ > + struct nat_action_info_t *nat_info; > + uint32_t mark; > + uint8_t conn_type; > }; > > enum ct_update_res { > @@ -67,6 +76,11 @@ enum ct_update_res { > CT_UPDATE_NEW, > }; > > +enum ct_conn_type { > + CT_CONN_TYPE_DEFAULT, > + CT_CONN_TYPE_UN_NAT, > +}; > + > struct ct_l4_proto { > struct conn *(*new_conn)(struct conntrack_bucket *, struct dp_packet *pkt, > long long now); > diff --git a/lib/conntrack.c b/lib/conntrack.c > index d0e106f..49760c0 100644 > --- a/lib/conntrack.c > +++ b/lib/conntrack.c > @@ -76,6 +76,20 @@ static void set_label(struct dp_packet *, struct conn *, > const struct ovs_key_ct_labels *mask); > static void *clean_thread_main(void *f_); > > +static struct nat_conn_key_node * > +nat_conn_keys_lookup(struct hmap *nat_conn_keys, > + const struct conn_key *key, > + uint32_t basis); > + > +static void > +nat_conn_keys_remove(struct hmap *nat_conn_keys, > + const struct conn_key *key, > + uint32_t basis); > + > +static bool > +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn, > + struct conn *nat_conn); > + > static struct ct_l4_proto *l4_protos[] = { > [IPPROTO_TCP] = &ct_proto_tcp, > [IPPROTO_UDP] = &ct_proto_other, > @@ -90,7 +104,7 @@ long long ct_timeout_val[] = { > }; > > /* If the total number of connections goes above this value, no new connections > - * are accepted */ > + * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */ > #define DEFAULT_N_CONN_LIMIT 3000000 > > /* Initializes the connection tracker 'ct'. 
The caller is responsible for > @@ -101,6 +115,11 @@ conntrack_init(struct conntrack *ct) > unsigned i, j; > long long now = time_msec(); > > + ct_rwlock_init(&ct->nat_resources_lock); > + ct_rwlock_wrlock(&ct->nat_resources_lock); > + hmap_init(&ct->nat_conn_keys); > + ct_rwlock_unlock(&ct->nat_resources_lock); > + > for (i = 0; i < CONNTRACK_BUCKETS; i++) { > struct conntrack_bucket *ctb = &ct->buckets[i]; > > @@ -139,13 +158,24 @@ conntrack_destroy(struct conntrack *ct) > ovs_mutex_destroy(&ctb->cleanup_mutex); > ct_lock_lock(&ctb->lock); > HMAP_FOR_EACH_POP(conn, node, &ctb->connections) { > - atomic_count_dec(&ct->n_conn); > + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { > + atomic_count_dec(&ct->n_conn); > + } > delete_conn(conn); > } > hmap_destroy(&ctb->connections); > ct_lock_unlock(&ctb->lock); > ct_lock_destroy(&ctb->lock); > } > + ct_rwlock_wrlock(&ct->nat_resources_lock); > + struct nat_conn_key_node *nat_conn_key_node; > + HMAP_FOR_EACH_POP(nat_conn_key_node, node, &ct->nat_conn_keys) { > + free(nat_conn_key_node); > + } > + hmap_destroy(&ct->nat_conn_keys); > + ct_rwlock_unlock(&ct->nat_resources_lock); > + ct_rwlock_destroy(&ct->nat_resources_lock); > + > } > > static unsigned hash_to_bucket(uint32_t hash) > @@ -158,29 +188,186 @@ static unsigned hash_to_bucket(uint32_t hash) > } > > static void > -write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone, > - uint32_t mark, ovs_u128 label) > +write_ct_md(struct dp_packet *pkt, uint16_t zone, uint32_t mark, > + ovs_u128 label) > { > - pkt->md.ct_state = state | CS_TRACKED; > + pkt->md.ct_state |= CS_TRACKED; > pkt->md.ct_zone = zone; > pkt->md.ct_mark = mark; > pkt->md.ct_label = label; > } > > +static void > +nat_packet(struct dp_packet *pkt, const struct conn *conn) > +{ > + if (conn->nat_info->nat_action & NAT_ACTION_SRC) { > + pkt->md.ct_state |= CS_SRC_NAT; > + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > + struct ip_header *nh = dp_packet_l3(pkt); > + packet_set_ipv4_addr(pkt, 
&nh->ip_src, > + conn->rev_key.dst.addr.ipv4_aligned); > + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { > + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); > + struct in6_addr ipv6_addr; > + memcpy(&ipv6_addr, conn->rev_key.dst.addr.ipv6.be32, > + sizeof ipv6_addr); > + packet_set_ipv6_addr(pkt, conn->key.nw_proto, > + nh6->ip6_src.be32, &ipv6_addr, true); > + } > + > + if (conn->key.nw_proto == IPPROTO_TCP) { > + struct tcp_header *th = dp_packet_l4(pkt); > + packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst); > + } else if (conn->key.nw_proto == IPPROTO_UDP) { > + struct udp_header *uh = dp_packet_l4(pkt); > + packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst); > + } > + } else if (conn->nat_info->nat_action & NAT_ACTION_DST) { > + pkt->md.ct_state |= CS_DST_NAT; > + > + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > + struct ip_header *nh = dp_packet_l3(pkt); > + packet_set_ipv4_addr(pkt, &nh->ip_dst, > + conn->rev_key.src.addr.ipv4_aligned); > + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { > + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); > + > + struct in6_addr ipv6_addr; > + memcpy(&ipv6_addr, conn->rev_key.dst.addr.ipv6.be32, > + sizeof ipv6_addr); > + packet_set_ipv6_addr(pkt, conn->key.nw_proto, > + nh6->ip6_dst.be32, &ipv6_addr, true); > + } > + if (conn->key.nw_proto == IPPROTO_TCP) { > + struct tcp_header *th = dp_packet_l4(pkt); > + packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port); > + } else if (conn->key.nw_proto == IPPROTO_UDP) { > + struct udp_header *uh = dp_packet_l4(pkt); > + packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port); > + } > + } > +} > + > +static void > +un_nat_packet(struct dp_packet *pkt, const struct conn *conn) > +{ > + if (conn->nat_info->nat_action & NAT_ACTION_SRC) { > + pkt->md.ct_state |= CS_SRC_NAT; > + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > + struct ip_header *nh = dp_packet_l3(pkt); > + packet_set_ipv4_addr(pkt, &nh->ip_dst, > + 
conn->key.src.addr.ipv4_aligned); > + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { > + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); > + struct in6_addr ipv6_addr; > + memcpy(&ipv6_addr, conn->key.src.addr.ipv6.be32, > + sizeof ipv6_addr); > + packet_set_ipv6_addr(pkt, conn->key.nw_proto, > + nh6->ip6_dst.be32, &ipv6_addr, true); > + } > + > + if (conn->key.nw_proto == IPPROTO_TCP) { > + struct tcp_header *th = dp_packet_l4(pkt); > + packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port); > + } else if (conn->key.nw_proto == IPPROTO_UDP) { > + struct udp_header *uh = dp_packet_l4(pkt); > + packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port); > + } > + } else if (conn->nat_info->nat_action & NAT_ACTION_DST) { > + pkt->md.ct_state |= CS_DST_NAT; > + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > + struct ip_header *nh = dp_packet_l3(pkt); > + packet_set_ipv4_addr(pkt, &nh->ip_src, > + conn->key.dst.addr.ipv4_aligned); > + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { > + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); > + struct in6_addr ipv6_addr; > + memcpy(&ipv6_addr, conn->key.dst.addr.ipv6.be32, > + sizeof ipv6_addr); > + packet_set_ipv6_addr(pkt, conn->key.nw_proto, > + nh6->ip6_src.be32, &ipv6_addr, true); > + } > + > + if (conn->key.nw_proto == IPPROTO_TCP) { > + struct tcp_header *th = dp_packet_l4(pkt); > + packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst); > + } else if (conn->key.nw_proto == IPPROTO_UDP) { > + struct udp_header *uh = dp_packet_l4(pkt); > + packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst); > + } > + } > +} > + > +/* Typical usage of this helper is in non per-packet code; > + * this is because the bucket lock needs to be held for lookup > + * and a hash would have already been needed. Hence, this function > + * is just intended for code clarity. 
*/ > +static struct conn * > +conn_lookup(struct conntrack *ct, struct conn_key *key, long long now) > +{ > + struct conn_lookup_ctx ctx; > + ctx.conn = NULL; > + ctx.key = *key; > + ctx.hash = conn_key_hash(key, ct->hash_basis); > + unsigned bucket = hash_to_bucket(ctx.hash); > + conn_key_lookup(&ct->buckets[bucket], &ctx, now); > + return ctx.conn; > +} > + > +static void > +nat_clean(struct conntrack *ct, struct conn *conn, > + struct conntrack_bucket *ctb) > + OVS_REQUIRES(ctb->lock) > +{ > + long long now = time_msec(); > + ct_rwlock_wrlock(&ct->nat_resources_lock); > + nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis); > + ct_rwlock_unlock(&ct->nat_resources_lock); > + ct_lock_unlock(&ctb->lock); > + > + uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis); > + unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn); > + > + ct_lock_lock(&ct->buckets[bucket_rev_conn].lock); > + ct_rwlock_wrlock(&ct->nat_resources_lock); > + > + struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now); > + > + struct nat_conn_key_node *nat_conn_key_node = > + nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key, > + ct->hash_basis); > + > + /* In the unlikely event, rev conn was recreated, then skip > + * rev_conn cleanup. 
*/ > + if ((rev_conn) && (!nat_conn_key_node || > + memcmp(&nat_conn_key_node->value, &rev_conn->rev_key, > + sizeof nat_conn_key_node->value))) { > + hmap_remove(&ct->buckets[bucket_rev_conn].connections, > + &rev_conn->node); > + free(rev_conn); > + } > + delete_conn(conn); > + > + ct_rwlock_unlock(&ct->nat_resources_lock); > + ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock); > + ct_lock_lock(&ctb->lock); > + > +} > + > static struct conn * > conn_not_found(struct conntrack *ct, struct dp_packet *pkt, > - struct conn_lookup_ctx *ctx, uint16_t *state, bool commit, > - long long now) > + struct conn_lookup_ctx *ctx, bool commit, long long now, > + const struct nat_action_info_t *nat_action_info, > + struct conn *conn_for_un_nat_copy) > { > unsigned bucket = hash_to_bucket(ctx->hash); > struct conn *nc = NULL; > > if (!valid_new(pkt, &ctx->key)) { > - *state |= CS_INVALID; > + pkt->md.ct_state = CS_INVALID; > return nc; > } > - > - *state |= CS_NEW; > + pkt->md.ct_state = CS_NEW; > > if (commit) { > unsigned int n_conn_limit; > @@ -193,71 +380,213 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, > } > > nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now); > + ctx->conn = nc; > + memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key); > + conn_key_reverse(&nc->rev_key); > > - memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); > + if (nat_action_info) { > + nc->nat_info = xzalloc(sizeof *nat_action_info); > + memcpy(nc->nat_info, nat_action_info, sizeof *nc->nat_info); > + ct_rwlock_wrlock(&ct->nat_resources_lock); > > - conn_key_reverse(&nc->rev_key); > + bool nat_res = nat_select_range_tuple(ct, nc, > + conn_for_un_nat_copy); > + > + if (!nat_res) { > + free(nc->nat_info); > + nc->nat_info = NULL; > + free (nc); > + ct_rwlock_unlock(&ct->nat_resources_lock); > + return NULL; > + } > + > + if (conn_for_un_nat_copy && > + nc->conn_type == CT_CONN_TYPE_DEFAULT) { > + *nc = *conn_for_un_nat_copy; > + conn_for_un_nat_copy->conn_type = 
CT_CONN_TYPE_UN_NAT; > + } > + ct_rwlock_unlock(&ct->nat_resources_lock); > + > + nat_packet(pkt, nc); > + } > hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash); > atomic_count_inc(&ct->n_conn); > } > - > return nc; > } > > -static struct conn * > -process_one(struct conntrack *ct, struct dp_packet *pkt, > - struct conn_lookup_ctx *ctx, uint16_t zone, > - bool commit, long long now) > +static bool > +conn_update_state(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, struct conn **conn, > + long long now, unsigned bucket) > + OVS_REQUIRES(ct->buckets[bucket].lock) > { > - unsigned bucket = hash_to_bucket(ctx->hash); > - struct conn *conn = ctx->conn; > - uint16_t state = 0; > + bool create_new_conn = false; > > - if (conn) { > - if (ctx->related) { > - state |= CS_RELATED; > + if (ctx->related) { > + pkt->md.ct_state |= CS_RELATED; > + if (ctx->reply) { > + pkt->md.ct_state |= CS_REPLY_DIR; > + } > + } else { > + enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket], > + pkt, ctx->reply, now); > + > + switch (res) { > + case CT_UPDATE_VALID: > + pkt->md.ct_state |= CS_ESTABLISHED; > + pkt->md.ct_state &= ~CS_NEW; > if (ctx->reply) { > - state |= CS_REPLY_DIR; > + pkt->md.ct_state |= CS_REPLY_DIR; > + } > + break; > + case CT_UPDATE_INVALID: > + pkt->md.ct_state = CS_INVALID; > + break; > + case CT_UPDATE_NEW: > + ovs_list_remove(&(*conn)->exp_node); > + hmap_remove(&ct->buckets[bucket].connections, &(*conn)->node); > + atomic_count_dec(&ct->n_conn); > + if ((*conn)->nat_info) { > + nat_clean(ct, *conn, &ct->buckets[bucket]); > + } else { > + delete_conn(*conn); > } > + create_new_conn = true; > + break; > + default: > + OVS_NOT_REACHED(); > + } > + } > + return create_new_conn; > +} > + > +static void > +create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy, > + long long now) > +{ > + struct conn *nc = xzalloc(sizeof *nc); > + memcpy(nc, conn_for_un_nat_copy, sizeof *nc); > + nc->key = 
conn_for_un_nat_copy->rev_key; > + nc->rev_key = conn_for_un_nat_copy->key; > + uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis); > + unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash); > + ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock); > + ct_rwlock_rdlock(&ct->nat_resources_lock); > + > + struct conn *rev_conn = conn_lookup(ct, &nc->key, now); > + > + struct nat_conn_key_node *nat_conn_key_node = > + nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis); > + if (nat_conn_key_node && !memcmp(&nat_conn_key_node->value, > + &nc->rev_key, sizeof nat_conn_key_node->value) && !rev_conn) { > + > + hmap_insert(&ct->buckets[un_nat_conn_bucket].connections, > + &nc->node, un_nat_hash); > } else { > - enum ct_update_res res; > + free(nc); > + } > + ct_rwlock_unlock(&ct->nat_resources_lock); > + ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock); > +} > > - res = conn_update(conn, &ct->buckets[bucket], pkt, > - ctx->reply, now); > +static void > +handle_nat(struct dp_packet *pkt, struct conn *conn, > + uint16_t zone, bool reply) > +{ > + if ((conn->nat_info) && > + (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) || > + (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) && > + zone != pkt->md.ct_zone))){ > > - switch (res) { > - case CT_UPDATE_VALID: > - state |= CS_ESTABLISHED; > - if (ctx->reply) { > - state |= CS_REPLY_DIR; > - } > - break; > - case CT_UPDATE_INVALID: > - state |= CS_INVALID; > - break; > - case CT_UPDATE_NEW: > - ovs_list_remove(&conn->exp_node); > - hmap_remove(&ct->buckets[bucket].connections, &conn->node); > - atomic_count_dec(&ct->n_conn); > - delete_conn(conn); > - conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > - break; > - default: > - OVS_NOT_REACHED(); > + if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) { > + pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT); > + } > + if (reply) { > + un_nat_packet(pkt, conn); > + } else { > + nat_packet(pkt, conn); > + } > + } > +} > + > +static void > 
+process_one(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone, > + bool commit, long long now, const uint32_t *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const struct nat_action_info_t *nat_action_info) > +{ > + struct conn *conn; > + unsigned bucket = hash_to_bucket(ctx->hash); > + ct_lock_lock(&ct->buckets[bucket].lock); > + conn_key_lookup(&ct->buckets[bucket], ctx, now); > + conn = ctx->conn; > + struct conn conn_for_un_nat_copy; > + memset(&conn_for_un_nat_copy, 0, sizeof conn_for_un_nat_copy); This memset seems expensive for non-NAT cases. Is there a way to do it only when it's necessary? The variable and its usage should be moved to where they are needed. There is no need for the memset; a flag assignment is enough. > + > + if (OVS_LIKELY(conn)) { > + if (conn->conn_type == CT_CONN_TYPE_UN_NAT) { > + ctx->reply = 1; > + > + struct conn_lookup_ctx ctx2; > + ctx2.conn = NULL; > + ctx2.key = conn->rev_key; > + ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis); > + > + ct_lock_unlock(&ct->buckets[bucket].lock); > + bucket = hash_to_bucket(ctx2.hash); > + > + ct_lock_lock(&ct->buckets[bucket].lock); > + conn_key_lookup(&ct->buckets[bucket], &ctx2, now); > + > + if (ctx2.conn) { > + conn = ctx2.conn; > + } else { > + /* It is a race condition where conn has timed out and removed > + * between unlock of the rev_conn and lock of the forward conn; > + * nothing to do. */ > + ct_lock_unlock(&ct->buckets[bucket].lock); > + return; > } > } > + } > + > + bool create_new_conn = false; > + if (OVS_LIKELY(conn)) { > + create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket); > + if (nat_action_info && !create_new_conn) { > + handle_nat(pkt, conn, zone, ctx->reply); > + } > } else { > if (ctx->related) { > - state |= CS_INVALID; > + pkt->md.ct_state = CS_INVALID; > } else { > - conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > + create_new_conn = true; > } > } > > - write_ct_md(pkt, state, zone, conn ? 
conn->mark : 0, > + if (OVS_UNLIKELY(create_new_conn)) { > + conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info, > + &conn_for_un_nat_copy); > + } > + > + write_ct_md(pkt, zone, conn ? conn->mark : 0, > conn ? conn->label : OVS_U128_ZERO); > > - return conn; > + if (conn && setmark) { > + set_mark(pkt, conn, setmark[0], setmark[1]); > + } > + > + if (conn && setlabel) { > + set_label(pkt, conn, &setlabel[0], &setlabel[1]); > + } > + > + ct_lock_unlock(&ct->buckets[bucket].lock); > + > + if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) { > + create_un_nat_conn(ct, &conn_for_un_nat_copy, now); > + } > } > > /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All > @@ -274,7 +603,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, > const uint32_t *setmark, > const struct ovs_key_ct_labels *setlabel, > const char *helper, > - const struct nat_action_info_t *nat_action_info OVS_UNUSED) > + const struct nat_action_info_t *nat_action_info) > { > struct dp_packet **pkts = pkt_batch->packets; > size_t cnt = pkt_batch->count; > @@ -297,27 +626,12 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, > for (i = 0; i < cnt; i++) { > > if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) { > - write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO); > + pkts[i]->md.ct_state = CS_INVALID; > + write_ct_md(pkts[i], zone, 0, OVS_U128_ZERO); > continue; > } > - > - unsigned bucket = hash_to_bucket(ctxs[i].hash); > - struct conntrack_bucket *ctb = &ct->buckets[bucket]; > - ct_lock_lock(&ctb->lock); > - conn_key_lookup(ctb, &ctxs[i], now); > - > - struct conn *conn = process_one(ct, pkts[i], &ctxs[i], zone, > - commit, now); > - > - if (conn && setmark) { > - set_mark(pkts[i], conn, setmark[0], setmark[1]); > - } > - > - if (conn && setlabel) { > - set_label(pkts[i], conn, &setlabel[0], &setlabel[1]); > - } > - > - ct_lock_unlock(&ctb->lock); > + process_one(ct, pkts[i], 
&ctxs[i], zone, commit, > + now, setmark, setlabel, nat_action_info); > } > > return 0; > @@ -346,6 +660,7 @@ set_label(struct dp_packet *pkt, struct conn *conn, > | (pkt->md.ct_label.u64.hi & ~(m.u64.hi)); > conn->label = pkt->md.ct_label; > } > + > > /* Delete the expired connections from 'ctb', up to 'limit'. Returns the > * earliest expiration time among the remaining connections in 'ctb'. Returns > @@ -363,20 +678,27 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now, > > for (i = 0; i < N_CT_TM; i++) { > LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) { > - if (!conn_expired(conn, now) || count >= limit) { > - min_expiration = MIN(min_expiration, conn->expiration); > - if (count >= limit) { > - /* Do not check other lists. */ > - COVERAGE_INC(conntrack_long_cleanup); > - return min_expiration; > + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { > + if (!conn_expired(conn, now) || count >= limit) { > + min_expiration = MIN(min_expiration, conn->expiration); > + if (count >= limit) { > + /* Do not check other lists. 
*/ > + COVERAGE_INC(conntrack_long_cleanup); > + return min_expiration; > + } > + break; > } > - break; > + ovs_list_remove(&conn->exp_node); > + hmap_remove(&ctb->connections, &conn->node); > + if (conn->nat_info) { > + nat_clean(ct, conn, ctb); > + } else { > + delete_conn(conn); > + } > + > + atomic_count_dec(&ct->n_conn); > + count++; > } > - ovs_list_remove(&conn->exp_node); > - hmap_remove(&ctb->connections, &conn->node); > - atomic_count_dec(&ct->n_conn); > - delete_conn(conn); > - count++; > } > } > > @@ -747,7 +1069,6 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size, > return false; > } > > - /* pf doesn't do this, but it seems a good idea */ > if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned > || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) { > return false; > @@ -971,7 +1292,6 @@ conn_key_hash(const struct conn_key *key, uint32_t basis) > > hsrc = hdst = basis; > > - /* Hash the source and destination tuple */ > for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { > hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); > hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); > @@ -998,6 +1318,275 @@ conn_key_reverse(struct conn_key *key) > key->dst = tmp; > } > > +static uint32_t > +nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min, > + struct in6_addr *ipv6_aligned_max) > +{ > + uint64_t diff = 0; > + uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0]; > + uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t); > + uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0]; > + uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t); > + > + ovs_be64 addr6_64_min_hi; > + ovs_be64 addr6_64_min_lo; > + memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi); > + memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo); > + ovs_be64 addr6_64_max_hi; > + ovs_be64 addr6_64_max_lo; > + memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi); > + 
memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo); > + > + if ((addr6_64_min_hi == addr6_64_max_hi) && > + (ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo))){ > + diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo); > + } else if ((ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi)) && > + (ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo))) { > + diff = UINT64_MAX - (ntohll(addr6_64_min_lo) - > + ntohll(addr6_64_max_lo) - 1); > + } else { > + /* Limit address delta supported to 32 bits or 4 billion approximately. > + * Possibly, this should be visible to the user through a datapath > + * support check, however the practical impact is probably nil. */ > + diff = 0xfffffffe; > + } > + if (diff > 0xfffffffe) { > + diff = 0xfffffffe; > + } > + return (uint32_t)diff; > +} > + > +/* This function must be used in tandem with nat_ipv6_addrs_delta(), which > + * restricts the input parameters. */ > +static void > +nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment) > +{ > + uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0]; > + uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64); > + ovs_be64 addr6_64_hi; > + ovs_be64 addr6_64_lo; > + memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi); > + memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo); > + > + if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) { > + addr6_64_lo = htonll(increment + ntohll(addr6_64_lo)); > + } else if (addr6_64_hi != UINT64_MAX) { > + addr6_64_hi = htonll(1 + ntohll(addr6_64_hi)); > + addr6_64_lo = htonll(increment - (UINT64_MAX - > + ntohll(addr6_64_lo) + 1)); > + } else { > + OVS_NOT_REACHED(); > + } > + > + memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi); > + memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo); > + > + return; > +} > + > +static uint32_t > +nat_range_hash(const struct conn *conn, uint32_t basis) > +{ > + uint32_t hash = basis; > + int i; > + uint32_t port; > + > + for (i = 0; > + i < sizeof(conn->nat_info->min_addr) / 
sizeof(uint32_t); > + i++) { > + hash = hash_add(hash, ((uint32_t *) &conn->nat_info->min_addr)[i]); > + hash = hash_add(hash, ((uint32_t *) &conn->nat_info->max_addr)[i]); > + } > + > + memcpy(&port, &conn->nat_info->min_port, sizeof port); > + hash = hash_add(hash, port); > + > + for (i = 0; i < sizeof(conn->key.src.addr) / sizeof(uint32_t); i++) { > + hash = hash_add(hash, ((uint32_t *) &conn->key.src)[i]); > + hash = hash_add(hash, ((uint32_t *) &conn->key.dst)[i]); > + } > + > + port = (OVS_FORCE uint32_t) conn->key.src.port; > + hash = hash_add(hash, port); > + port = (OVS_FORCE uint32_t) conn->key.dst.port; > + hash = hash_add(hash, port); > + > + uint32_t dl_type_for_hash = (OVS_FORCE uint32_t) conn->key.dl_type; > + hash = hash_add(hash, dl_type_for_hash); > + uint32_t nw_proto_for_hash = (uint32_t) conn->key.nw_proto; > + hash = hash_add(hash, nw_proto_for_hash); > + uint32_t zone_for_hash = (uint32_t) conn->key.zone; > + hash = hash_add(hash, zone_for_hash); > + return hash; > +} > + > +static bool > +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn, > + struct conn *nat_conn) > +{ > +#define MIN_NAT_EPHEMERAL_PORT 1024 > +#define MAX_NAT_EPHEMERAL_PORT 65535 > + > + uint16_t min_port; > + uint16_t max_port; > + uint16_t first_port; > + > + uint32_t hash = nat_range_hash(conn, ct->hash_basis); > + > + if ((conn->nat_info->nat_action & NAT_ACTION_SRC) && > + (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) { > + min_port = ntohs(conn->key.src.port); > + max_port = ntohs(conn->key.src.port); > + first_port = min_port; > + } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) && > + (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) { > + min_port = ntohs(conn->key.dst.port); > + max_port = ntohs(conn->key.dst.port); > + first_port = min_port; > + } else { > + uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port; > + uint32_t port_index = hash % (deltap + 1); > + first_port = conn->nat_info->min_port + 
port_index; > + min_port = conn->nat_info->min_port; > + max_port = conn->nat_info->max_port; > + } > + > + uint32_t deltaa = 0; > + uint32_t address_index; > + struct ct_addr ct_addr; > + memset(&ct_addr, 0, sizeof ct_addr); > + struct ct_addr max_ct_addr; > + memset(&max_ct_addr, 0, sizeof max_ct_addr); > + max_ct_addr = conn->nat_info->max_addr; > + > + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > + deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) - > + ntohl(conn->nat_info->min_addr.ipv4_aligned); > + address_index = hash % (deltaa + 1); > + ct_addr.ipv4_aligned = htonl( > + ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index); > + } else { > + deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned, > + &conn->nat_info->max_addr.ipv6_aligned); > + /* deltaa must be within 32 bits for full hash coverage. A 64 or > + * 128 bit hash is unnecessary and hence not used here. Most code > + * is kept common with V4; nat_ipv6_addrs_delta() will do the > + * enforcement via max_ct_addr. 
*/ > + max_ct_addr = conn->nat_info->min_addr; > + nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa); > + > + address_index = hash % (deltaa + 1); > + ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned; > + nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index); > + } > + > + uint16_t port = first_port; > + bool all_ports_tried = false; > + bool original_ports_tried = false; > + struct ct_addr first_addr = ct_addr; > + *nat_conn = *conn; > + > + while (true) { > + if (conn->nat_info->nat_action & NAT_ACTION_SRC) { > + nat_conn->rev_key.dst.addr = ct_addr; > + nat_conn->rev_key.dst.port = htons(port); > + } else { > + nat_conn->rev_key.src.addr = ct_addr; > + nat_conn->rev_key.src.port = htons(port); > + } > + > + struct nat_conn_key_node *nat_conn_key_node = > + nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key, > + ct->hash_basis); > + > + if (!nat_conn_key_node) { > + struct nat_conn_key_node *nat_conn_key = > + xzalloc(sizeof *nat_conn_key); > + memcpy(&nat_conn_key->key, &nat_conn->rev_key, > + sizeof nat_conn_key->key); > + memcpy(&nat_conn_key->value, &nat_conn->key, > + sizeof nat_conn_key->value); > + uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key, > + ct->hash_basis); > + hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node, > + nat_conn_key_hash); > + return true; > + } else if (!all_ports_tried) { > + if (min_port == max_port) { > + all_ports_tried = true; > + } else if (port == max_port) { > + port = min_port; > + } else { > + port++; > + } > + if (port == first_port) { > + all_ports_tried = true; > + } > + } else { > + if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) { > + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > + ct_addr.ipv4_aligned = htonl( > + ntohl(ct_addr.ipv4_aligned) + 1); > + } else { > + nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1); > + } > + } else { > + ct_addr = conn->nat_info->min_addr; > + } > + if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) { > + if 
(!original_ports_tried) { > + original_ports_tried = true; > + ct_addr = conn->nat_info->min_addr; > + min_port = MIN_NAT_EPHEMERAL_PORT; > + max_port = MAX_NAT_EPHEMERAL_PORT; > + } else { > + break; > + } > + } > + first_port = min_port; > + port = first_port; > + all_ports_tried = false; > + } > + } > + return false; > +} > + > +static struct nat_conn_key_node * > +nat_conn_keys_lookup(struct hmap *nat_conn_keys, > + const struct conn_key *key, > + uint32_t basis) > +{ > + struct nat_conn_key_node *nat_conn_key_node; > + uint32_t nat_conn_key_hash = conn_key_hash(key, basis); > + > + HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, > + nat_conn_keys) { > + if (!memcmp(&nat_conn_key_node->key, key, > + sizeof nat_conn_key_node->key)) { > + return nat_conn_key_node; > + } > + } > + return NULL; > +} > + > +static void > +nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key, > + uint32_t basis) > +{ > + struct nat_conn_key_node *nat_conn_key_node; > + uint32_t nat_conn_key_hash = conn_key_hash(key, basis); > + > + HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, > + nat_conn_keys) { > + if (!memcmp(&nat_conn_key_node->key, key, > + sizeof nat_conn_key_node->key)) { > + hmap_remove(nat_conn_keys, &nat_conn_key_node->node); > + free(nat_conn_key_node); > + return; > + } > + } > +} > + > static void > conn_key_lookup(struct conntrack_bucket *ctb, > struct conn_lookup_ctx *ctx, > @@ -1009,13 +1598,13 @@ conn_key_lookup(struct conntrack_bucket *ctb, > ctx->conn = NULL; > > HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) { > - if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key)) > + if (!memcmp(&conn->key, &ctx->key, sizeof conn->key) > && !conn_expired(conn, now)) { > ctx->conn = conn; > ctx->reply = false; > break; > } > - if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key)) > + if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key) > && !conn_expired(conn, now)) { > ctx->conn 
= conn; > ctx->reply = true; > @@ -1035,7 +1624,10 @@ conn_update(struct conn *conn, struct conntrack_bucket *ctb, > static bool > conn_expired(struct conn *conn, long long now) > { > - return now >= conn->expiration; > + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { > + return now >= conn->expiration; > + } > + return false; > } > > static bool > @@ -1062,6 +1654,7 @@ new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt, > static void > delete_conn(struct conn *conn) > { > + free(conn->nat_info); > free(conn); > } > > @@ -1114,7 +1707,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, > entry->zone = conn->key.zone; > entry->mark = conn->mark; > > - memcpy(&entry->labels, &conn->label, sizeof(entry->labels)); > + memcpy(&entry->labels, &conn->label, sizeof entry->labels); > /* Not implemented yet */ > entry->timestamp.start = 0; > entry->timestamp.stop = 0; > @@ -1161,7 +1754,8 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) > break; > } > INIT_CONTAINER(conn, node, node); > - if (!dump->filter_zone || conn->key.zone == dump->zone) { > + if ((!dump->filter_zone || conn->key.zone == dump->zone) && > + (conn->conn_type != CT_CONN_TYPE_UN_NAT)){ > conn_to_ct_dpif_entry(conn, entry, now); > break; > } > @@ -1196,15 +1790,19 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone) > > ct_lock_lock(&ct->buckets[i].lock); > HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) { > - if (!zone || *zone == conn->key.zone) { > + if ((!zone || *zone == conn->key.zone) && > + (conn->conn_type == CT_CONN_TYPE_DEFAULT)) { > ovs_list_remove(&conn->exp_node); > hmap_remove(&ct->buckets[i].connections, &conn->node); > atomic_count_dec(&ct->n_conn); > - delete_conn(conn); > + if (conn->nat_info) { > + nat_clean(ct, conn, &ct->buckets[i]); > + } else { > + delete_conn(conn); > + } > } > } > ct_lock_unlock(&ct->buckets[i].lock); > } > - > return 0; > } > diff --git a/lib/conntrack.h 
b/lib/conntrack.h > index 288808b..74a43b5 100644 > --- a/lib/conntrack.h > +++ b/lib/conntrack.h > @@ -121,6 +121,10 @@ struct OVS_LOCKABLE ct_lock { > struct ovs_mutex lock; > }; > > +struct OVS_LOCKABLE ct_rwlock { > + struct ovs_rwlock lock; > +}; > + > static inline void ct_lock_init(struct ct_lock *lock) > { > ovs_mutex_init_adaptive(&lock->lock); > @@ -144,6 +148,38 @@ static inline void ct_lock_destroy(struct ct_lock *lock) > { > ovs_mutex_destroy(&lock->lock); > } > + > +static inline void ct_rwlock_init(struct ct_rwlock *lock) > +{ > + ovs_rwlock_init(&lock->lock); > +} > + > +static inline void ct_rwlock_wrlock(struct ct_rwlock *lock) > + OVS_ACQUIRES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_rwlock_wrlock(&lock->lock); > +} > + > +static inline void ct_rwlock_rdlock(struct ct_rwlock *lock) > + OVS_ACQUIRES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_rwlock_rdlock(&lock->lock); > +} > + > +static inline void ct_rwlock_unlock(struct ct_rwlock *lock) > + OVS_RELEASES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_rwlock_unlock(&lock->lock); > +} > + > +static inline void ct_rwlock_destroy(struct ct_rwlock *lock) > +{ > + ovs_rwlock_destroy(&lock->lock); > +} > + > > /* Timeouts: all the possible timeout states passed to update_expiration() > * are listed here. The name will be prefix by CT_TM_ and the value is in > @@ -226,6 +262,16 @@ struct conntrack { > /* Connections limit. When this limit is reached, no new connection > * will be accepted. */ > atomic_uint n_conn_limit; > + > + /* The following resources are referenced during nat connection > + * creation and deletion. */ > + struct hmap nat_conn_keys OVS_GUARDED; > + /* This lock is used during NAT connection creation and deletion; > + * it is taken after a bucket lock and given back before that > + * bucket unlock. 
> + */ > + struct ct_rwlock nat_resources_lock; > + > }; > > #endif /* conntrack.h */ > -- > 1.9.1 > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=uKbzOlSG64hQ4MzEKR9BqtYcvsf5x2Z7_WghMmb495w&s=eLkW6WShFy9UmKeK90cFXIbHB-uYmY0M7RzXpQhc000&e= _______________________________________________ dev mailing list d...@openvswitch.org https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=uKbzOlSG64hQ4MzEKR9BqtYcvsf5x2Z7_WghMmb495w&s=eLkW6WShFy9UmKeK90cFXIbHB-uYmY0M7RzXpQhc000&e= _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev