This patch introduces NAT support for the userspace datapath. The conntrack module changes are in this patch.
The per packet scope of lookups for NAT and un_NAT is at the bucket level rather than global. One hash table is introduced to support create/delete handling. The create/delete events may be further optimized, if the need becomes clear. Some NAT options with limited utility (persistent, random) are not supported yet, but will be supported in a later patch. Signed-off-by: Darrell Ball <dlu...@gmail.com> --- lib/conntrack-private.h | 16 +- lib/conntrack.c | 740 ++++++++++++++++++++++++++++++++++++++++++------ lib/conntrack.h | 42 +++ 3 files changed, 714 insertions(+), 84 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index 493865f..b71af37 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -51,14 +51,23 @@ struct conn_key { uint16_t zone; }; +struct nat_conn_key_node { + struct hmap_node node; + struct conn_key key; + struct conn_key value; +}; + struct conn { struct conn_key key; struct conn_key rev_key; long long expiration; struct ovs_list exp_node; struct hmap_node node; - uint32_t mark; ovs_u128 label; + /* XXX: consider flattening. */ + struct nat_action_info_t *nat_info; + uint32_t mark; + uint8_t conn_type; }; enum ct_update_res { @@ -67,6 +76,11 @@ enum ct_update_res { CT_UPDATE_NEW, }; +enum ct_conn_type { + CT_CONN_TYPE_DEFAULT, + CT_CONN_TYPE_UN_NAT, +}; + struct ct_l4_proto { struct conn *(*new_conn)(struct conntrack_bucket *, struct dp_packet *pkt, long long now); diff --git a/lib/conntrack.c b/lib/conntrack.c index bae42a3..c629541 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -76,6 +76,20 @@ static void set_label(struct dp_packet *, struct conn *, const struct ovs_key_ct_labels *mask); static void *clean_thread_main(void *f_); +static struct nat_conn_key_node * +nat_conn_keys_lookup(struct hmap *nat_conn_keys, + const struct conn *conn, + uint32_t basis); + +static void +nat_conn_keys_remove(struct hmap *nat_conn_keys, + const struct conn *conn, + uint32_t basis); + +static bool +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn, + struct conn *rev_conn); + static struct ct_l4_proto *l4_protos[] = { [IPPROTO_TCP] = &ct_proto_tcp, [IPPROTO_UDP] = &ct_proto_other, @@ -90,7 +104,7 @@ long long ct_timeout_val[] = { }; /* If the total number of connections goes above this value, no new connections - * are accepted */ + * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */ #define DEFAULT_N_CONN_LIMIT 3000000 /* Initializes the connection tracker 'ct'. The caller is responsible for @@ -101,6 +115,11 @@ conntrack_init(struct conntrack *ct) unsigned i, j; long long now = time_msec(); + ct_rwlock_init(&ct->nat_resources_lock); + ct_rwlock_wrlock(&ct->nat_resources_lock); + hmap_init(&ct->nat_conn_keys); + ct_rwlock_unlock(&ct->nat_resources_lock); + for (i = 0; i < CONNTRACK_BUCKETS; i++) { struct conntrack_bucket *ctb = &ct->buckets[i]; @@ -139,13 +158,24 @@ conntrack_destroy(struct conntrack *ct) ovs_mutex_destroy(&ctb->cleanup_mutex); ct_lock_lock(&ctb->lock); HMAP_FOR_EACH_POP(conn, node, &ctb->connections) { - atomic_count_dec(&ct->n_conn); + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { + atomic_count_dec(&ct->n_conn); + } delete_conn(conn); } hmap_destroy(&ctb->connections); ct_lock_unlock(&ctb->lock); ct_lock_destroy(&ctb->lock); } + ct_rwlock_wrlock(&ct->nat_resources_lock); + struct nat_conn_key_node *nat_conn_key_node; + HMAP_FOR_EACH_POP(nat_conn_key_node, node, &ct->nat_conn_keys) { + free(nat_conn_key_node); + } + hmap_destroy(&ct->nat_conn_keys); + ct_rwlock_unlock(&ct->nat_resources_lock); + ct_rwlock_destroy(&ct->nat_resources_lock); + } static unsigned hash_to_bucket(uint32_t hash) @@ -167,10 +197,175 @@ write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone, pkt->md.ct_label = label; } +static void +nat_packet(struct dp_packet *pkt, const struct conn *conn, uint16_t *state) +{ + if (conn->nat_info->nat_action & NAT_ACTION_SRC) { + *state |= CS_SRC_NAT; + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { + struct ip_header *nh = dp_packet_l3(pkt); + packet_set_ipv4_addr(pkt, &nh->ip_src, + conn->rev_key.dst.addr.ipv4_aligned); + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); + struct in6_addr ipv6_addr; + memcpy(&ipv6_addr.s6_addr32, conn->rev_key.dst.addr.ipv6.be32, + sizeof ipv6_addr); + packet_set_ipv6_addr(pkt, conn->key.nw_proto, + nh6->ip6_src.be32, + &ipv6_addr, + true); + } + + if (conn->key.nw_proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(pkt); + packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst); + } else if (conn->key.nw_proto == IPPROTO_UDP) { + struct udp_header *uh = dp_packet_l4(pkt); + packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst); + } + } else if (conn->nat_info->nat_action & NAT_ACTION_DST) { + *state |= CS_DST_NAT; + + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { + struct ip_header *nh = dp_packet_l3(pkt); + packet_set_ipv4_addr(pkt, &nh->ip_dst, + conn->rev_key.src.addr.ipv4_aligned); + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); + + struct in6_addr ipv6_addr; + memcpy(&ipv6_addr.s6_addr32, conn->rev_key.dst.addr.ipv6.be32, + sizeof ipv6_addr); + packet_set_ipv6_addr(pkt, conn->key.nw_proto, + nh6->ip6_dst.be32, + &ipv6_addr, + true); + } + if (conn->key.nw_proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(pkt); + packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port); + } else if (conn->key.nw_proto == IPPROTO_UDP) { + struct udp_header *uh = dp_packet_l4(pkt); + packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port); + } + } +} + +static void +un_nat_packet(struct dp_packet *pkt, const struct conn *conn, + uint16_t *state) +{ + if (conn->nat_info->nat_action & NAT_ACTION_SRC) { + *state |= CS_SRC_NAT; + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { + struct ip_header *nh = dp_packet_l3(pkt); + packet_set_ipv4_addr(pkt, &nh->ip_dst, + conn->key.src.addr.ipv4_aligned); + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); + struct in6_addr ipv6_addr; + memcpy(&ipv6_addr, conn->key.src.addr.ipv6.be32, + sizeof ipv6_addr); + packet_set_ipv6_addr(pkt, conn->key.nw_proto, + nh6->ip6_dst.be32, + &ipv6_addr, + true); + } + + if (conn->key.nw_proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(pkt); + packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port); + } else if (conn->key.nw_proto == IPPROTO_UDP) { + struct udp_header *uh = dp_packet_l4(pkt); + packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port); + } + } else if (conn->nat_info->nat_action & NAT_ACTION_DST) { + *state |= CS_DST_NAT; + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { + struct ip_header *nh = dp_packet_l3(pkt); + packet_set_ipv4_addr(pkt, &nh->ip_src, + conn->key.dst.addr.ipv4_aligned); + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) { + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); + struct in6_addr ipv6_addr; + memcpy(&ipv6_addr, conn->key.dst.addr.ipv6.be32, + sizeof ipv6_addr); + packet_set_ipv6_addr(pkt, conn->key.nw_proto, + nh6->ip6_src.be32, + &ipv6_addr, + true); + } + + if (conn->key.nw_proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(pkt); + packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst); + } else if (conn->key.nw_proto == IPPROTO_UDP) { + struct udp_header *uh = dp_packet_l4(pkt); + packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst); + } + } +} + +/* Typical usage of this helper is in non per-packet code; + * this is because the bucket lock needs to be held for lookup + * and a hash would have already been needed. Hence, this function + * is just intended for code clarity. */ +static struct conn * +conn_lookup(struct conntrack *ct, struct conn_key *key, long long now) +{ + struct conn_lookup_ctx ctx; + ctx.conn = NULL; + ctx.key = *key; + ctx.hash = conn_key_hash(key, ct->hash_basis); + unsigned bucket = hash_to_bucket(ctx.hash); + conn_key_lookup(&ct->buckets[bucket], &ctx, now); + return ctx.conn; +} + +static void +nat_clean(struct conntrack *ct, struct conn *conn, + struct conntrack_bucket *ctb) + OVS_REQUIRES(ctb->lock) +{ + long long now = time_msec(); + ct_rwlock_wrlock(&ct->nat_resources_lock); + nat_conn_keys_remove(&ct->nat_conn_keys, conn, ct->hash_basis); + ct_rwlock_unlock(&ct->nat_resources_lock); + ct_lock_unlock(&ctb->lock); + + uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis); + unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn); + + ct_lock_lock(&ct->buckets[bucket_rev_conn].lock); + ct_rwlock_wrlock(&ct->nat_resources_lock); + + struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now); + + struct nat_conn_key_node *nat_conn_key_node = + nat_conn_keys_lookup(&ct->nat_conn_keys, conn, ct->hash_basis); + + /* In the unlikely event, rev conn was recreated, then skip rev_conn cleanup. */ + if ((rev_conn) && (!nat_conn_key_node || + memcmp(&nat_conn_key_node->value, &rev_conn->key, + sizeof nat_conn_key_node->value))) { + hmap_remove(&ct->buckets[bucket_rev_conn].connections, + &rev_conn->node); + free(rev_conn); + } + delete_conn(conn); + + ct_rwlock_unlock(&ct->nat_resources_lock); + ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock); + ct_lock_lock(&ctb->lock); + +} + static struct conn * conn_not_found(struct conntrack *ct, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, uint16_t *state, bool commit, - long long now) + long long now, const struct nat_action_info_t *nat_action_info, + struct conn *conn_for_un_nat_copy) { unsigned bucket = hash_to_bucket(ctx->hash); struct conn *nc = NULL; @@ -179,7 +374,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, *state |= CS_INVALID; return nc; } - *state |= CS_NEW; if (commit) { @@ -193,71 +387,209 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, } nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now); + ctx->conn = nc; + memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key); + conn_key_reverse(&nc->rev_key); - memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); + if (nat_action_info && nat_action_info->nat_action & NAT_ACTION) { + nc->nat_info = xzalloc(sizeof *nat_action_info); + memcpy(nc->nat_info, nat_action_info, sizeof *nc->nat_info); + ct_rwlock_wrlock(&ct->nat_resources_lock); - conn_key_reverse(&nc->rev_key); + bool nat_res = nat_select_range_tuple(ct, nc, conn_for_un_nat_copy); + + if (!nat_res) { + free(nc->nat_info); + nc->nat_info = NULL; + free (nc); + ct_rwlock_unlock(&ct->nat_resources_lock); + return NULL; + } + + if (conn_for_un_nat_copy && nc->conn_type == CT_CONN_TYPE_DEFAULT) { + *nc = *conn_for_un_nat_copy; + conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT; + } + ct_rwlock_unlock(&ct->nat_resources_lock); + + nat_packet(pkt, nc, state); + } hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash); atomic_count_inc(&ct->n_conn); } - return nc; } -static struct conn * -process_one(struct conntrack *ct, struct dp_packet *pkt, - struct conn_lookup_ctx *ctx, uint16_t zone, - bool commit, long long now) +static bool +conn_update_state(struct conntrack *ct, struct dp_packet *pkt, + struct conn_lookup_ctx *ctx, uint16_t *state, + struct conn **conn, long long now, + unsigned bucket) + OVS_REQUIRES(ct->buckets[bucket].lock) { - unsigned bucket = hash_to_bucket(ctx->hash); - struct conn *conn = ctx->conn; - uint16_t state = 0; + bool create_new_conn = false; - if (conn) { - if (ctx->related) { - state |= CS_RELATED; + if (ctx->related) { + *state |= CS_RELATED; + if (ctx->reply) { + *state |= CS_REPLY_DIR; + } + } else { + enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket], + pkt, ctx->reply, now); + + switch (res) { + case CT_UPDATE_VALID: + *state |= CS_ESTABLISHED; if (ctx->reply) { - state |= CS_REPLY_DIR; + *state |= CS_REPLY_DIR; + } + break; + case CT_UPDATE_INVALID: + *state |= CS_INVALID; + break; + case CT_UPDATE_NEW: + ovs_list_remove(&(*conn)->exp_node); + hmap_remove(&ct->buckets[bucket].connections, &(*conn)->node); + atomic_count_dec(&ct->n_conn); + if ((*conn)->nat_info) { + nat_clean(ct, *conn, &ct->buckets[bucket]); + } else { + delete_conn(*conn); } + create_new_conn = true; + break; + default: + OVS_NOT_REACHED(); + } + } + return create_new_conn; +} + +static void +create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy, + long long now) +{ + struct conn *nc = xzalloc(sizeof *nc); + memcpy(nc, conn_for_un_nat_copy, sizeof *nc); + nc->key = conn_for_un_nat_copy->rev_key; + nc->rev_key = conn_for_un_nat_copy->key; + uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis); + unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash); + ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock); + ct_rwlock_rdlock(&ct->nat_resources_lock); + + struct conn *rev_conn = conn_lookup(ct, &nc->key, now); + struct nat_conn_key_node *nat_conn_key_node = + nat_conn_keys_lookup(&ct->nat_conn_keys, nc, ct->hash_basis); + if (nat_conn_key_node && !memcmp(&nat_conn_key_node->value, + &nc->rev_key, sizeof nat_conn_key_node->value) && !rev_conn) { + + hmap_insert(&ct->buckets[un_nat_conn_bucket].connections, &nc->node, + un_nat_hash); } else { - enum ct_update_res res; + free(nc); + } + ct_rwlock_unlock(&ct->nat_resources_lock); + ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock); +} + +static void +handle_nat(struct dp_packet *pkt, struct conn *conn, + uint16_t zone, uint16_t *state, bool reply) +{ + if ((conn->nat_info && conn->nat_info->nat_action & NAT_ACTION) && + (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) || + (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) && + zone != pkt->md.ct_zone))){ - res = conn_update(conn, &ct->buckets[bucket], pkt, - ctx->reply, now); + if(reply) { + un_nat_packet(pkt, conn, state); + } else { + nat_packet(pkt, conn, state); + } + } +} - switch (res) { - case CT_UPDATE_VALID: - state |= CS_ESTABLISHED; - if (ctx->reply) { - state |= CS_REPLY_DIR; - } - break; - case CT_UPDATE_INVALID: - state |= CS_INVALID; - break; - case CT_UPDATE_NEW: - ovs_list_remove(&conn->exp_node); - hmap_remove(&ct->buckets[bucket].connections, &conn->node); - atomic_count_dec(&ct->n_conn); - delete_conn(conn); - conn = conn_not_found(ct, pkt, ctx, &state, commit, now); - break; - default: - OVS_NOT_REACHED(); +static void +process_one(struct conntrack *ct, struct dp_packet *pkt, + struct conn_lookup_ctx *ctx, uint16_t zone, + bool commit, long long now, const uint32_t *setmark, + const struct ovs_key_ct_labels *setlabel, + const struct nat_action_info_t *nat_action_info) +{ + struct conn *conn; + uint16_t state = 0; + unsigned bucket = hash_to_bucket(ctx->hash); + ct_lock_lock(&ct->buckets[bucket].lock); + conn_key_lookup(&ct->buckets[bucket], ctx, now); + conn = ctx->conn; + struct conn conn_for_un_nat_copy; + memset(&conn_for_un_nat_copy, 0, sizeof conn_for_un_nat_copy); + + if (OVS_LIKELY(conn)) { + if (conn->conn_type == CT_CONN_TYPE_UN_NAT){ + ctx->reply = 1; + + struct conn_lookup_ctx ctx2; + ctx2.conn = NULL; + ctx2.key = conn->rev_key; + ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis); + + ct_lock_unlock(&ct->buckets[bucket].lock); + bucket = hash_to_bucket(ctx2.hash); + + ct_lock_lock(&ct->buckets[bucket].lock); + conn_key_lookup(&ct->buckets[bucket], &ctx2, now); + + if (ctx2.conn) { + conn = ctx2.conn; + } else { + /* It is a race condition where conn has timed out and removed + * between unlock of the rev_conn and lock of the forward conn; + * nothing to do. */ + ct_lock_unlock(&ct->buckets[bucket].lock); + return; } } + } + + bool create_new_conn = false; + if (OVS_LIKELY(conn)) { + create_new_conn = conn_update_state(ct, pkt, ctx, &state, &conn, + now, bucket); + if (nat_action_info->nat_action && !create_new_conn) { + handle_nat(pkt, conn, zone, &state, ctx->reply); + } } else { if (ctx->related) { state |= CS_INVALID; } else { - conn = conn_not_found(ct, pkt, ctx, &state, commit, now); + create_new_conn = true; } } + if (OVS_UNLIKELY(create_new_conn)) { + conn = conn_not_found(ct, pkt, ctx, &state, commit, + now, nat_action_info, &conn_for_un_nat_copy); + } + write_ct_md(pkt, state, zone, conn ? conn->mark : 0, conn ? conn->label : OVS_U128_ZERO); - return conn; + if (conn && setmark) { + set_mark(pkt, conn, setmark[0], setmark[1]); + } + + if (conn && setlabel) { + set_label(pkt, conn, &setlabel[0], &setlabel[1]); + } + + ct_lock_unlock(&ct->buckets[bucket].lock); + + if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) { + create_un_nat_conn(ct, &conn_for_un_nat_copy, now); + } } /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All @@ -274,7 +606,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, const char *helper, - const struct nat_action_info_t *nat_action_info OVS_UNUSED) + const struct nat_action_info_t *nat_action_info) { struct dp_packet **pkts = pkt_batch->packets; size_t cnt = pkt_batch->count; @@ -324,27 +656,12 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, } for (i = 0; i < arrcnt; i++) { - struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket]; size_t j; - ct_lock_lock(&ctb->lock); - ULLONG_FOR_EACH_1(j, arr[i].maps) { - struct conn *conn; - - conn_key_lookup(ctb, &ctxs[j], now); - - conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now); - - if (conn && setmark) { - set_mark(pkts[j], conn, setmark[0], setmark[1]); - } - - if (conn && setlabel) { - set_label(pkts[j], conn, &setlabel[0], &setlabel[1]); - } + process_one(ct, pkts[j], &ctxs[j], zone, commit, + now, setmark, setlabel, nat_action_info); } - ct_lock_unlock(&ctb->lock); } return 0; @@ -373,6 +690,7 @@ set_label(struct dp_packet *pkt, struct conn *conn, | (pkt->md.ct_label.u64.hi & ~(m.u64.hi)); conn->label = pkt->md.ct_label; } + /* Delete the expired connections from 'ctb', up to 'limit'. Returns the * earliest expiration time among the remaining connections in 'ctb'. Returns @@ -390,20 +708,27 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now, for (i = 0; i < N_CT_TM; i++) { LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) { - if (!conn_expired(conn, now) || count >= limit) { - min_expiration = MIN(min_expiration, conn->expiration); - if (count >= limit) { - /* Do not check other lists. */ - COVERAGE_INC(conntrack_long_cleanup); - return min_expiration; + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { + if (!conn_expired(conn, now) || count >= limit) { + min_expiration = MIN(min_expiration, conn->expiration); + if (count >= limit) { + /* Do not check other lists. */ + COVERAGE_INC(conntrack_long_cleanup); + return min_expiration; + } + break; } - break; + ovs_list_remove(&conn->exp_node); + hmap_remove(&ctb->connections, &conn->node); + if (conn->nat_info) { + nat_clean(ct, conn, ctb); + } else { + delete_conn(conn); + } + + atomic_count_dec(&ct->n_conn); + count++; } - ovs_list_remove(&conn->exp_node); - hmap_remove(&ctb->connections, &conn->node); - atomic_count_dec(&ct->n_conn); - delete_conn(conn); - count++; } } @@ -774,7 +1099,6 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size, return false; } - /* pf doesn't do this, but it seems a good idea */ if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) { return false; @@ -998,7 +1322,6 @@ conn_key_hash(const struct conn_key *key, uint32_t basis) hsrc = hdst = basis; - /* Hash the source and destination tuple */ for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); @@ -1025,6 +1348,245 @@ conn_key_reverse(struct conn_key *key) key->dst = tmp; } +static uint32_t +nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min, struct in6_addr *ipv6_aligned_max) +{ + uint64_t diff = 0; + ovs_u128 addr6_128_min; + memcpy(&addr6_128_min.u64.hi, &ipv6_aligned_min->s6_addr32[0], + sizeof addr6_128_min.u64.hi); + memcpy(&addr6_128_min.u64.lo, &ipv6_aligned_min->s6_addr32[2], + sizeof addr6_128_min.u64.lo); + ovs_u128 addr6_128_max; + memcpy(&addr6_128_max.u64.hi, &ipv6_aligned_max->s6_addr32[0], + sizeof addr6_128_max.u64.hi); + memcpy(&addr6_128_max.u64.lo, &ipv6_aligned_max->s6_addr32[2], + sizeof addr6_128_max.u64.lo); + + if ((ntohll(addr6_128_min.u64.hi) == ntohll(addr6_128_max.u64.hi)) && + (ntohll(addr6_128_min.u64.lo) <= ntohll(addr6_128_max.u64.lo))){ + + diff = ntohll(addr6_128_max.u64.lo) - ntohll(addr6_128_min.u64.lo); + + } else if ((ntohll(addr6_128_min.u64.hi) + 1 == ntohll(addr6_128_max.u64.hi)) && + (ntohll(addr6_128_min.u64.lo) > ntohll(addr6_128_max.u64.lo))) { + diff = 0xffffffffffffffff - (ntohll(addr6_128_min.u64.lo) - + ntohll(addr6_128_max.u64.lo) - 1); + } else { + /* Limit address delta supported to 32 bits or 4 billion approximately. + * Possibly, this should be visible to the user through a datapath + * support check, however the practical impact is probably nil. */ + diff = 0xffffffff; + } + return (uint32_t)diff; +} + +/* This function must be used in tandem with nat_ipv6_addrs_delta(), which + * restricts the input parameters. */ +static void +nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment) +{ + ovs_u128 addr6_128; + memcpy(&addr6_128.u64.hi, &ipv6_aligned->s6_addr32[0], sizeof addr6_128.u64.hi); + memcpy(&addr6_128.u64.lo, &ipv6_aligned->s6_addr32[2], sizeof addr6_128.u64.lo); + + if (0xffffffffffffffff - increment >= ntohll(addr6_128.u64.lo)) { + addr6_128.u64.lo = htonll(increment + ntohll(addr6_128.u64.lo)); + } else if (addr6_128.u64.hi != 0xffffffffffffffff) { + addr6_128.u64.hi = htonll(1 + ntohll(addr6_128.u64.hi)); + addr6_128.u64.lo = + htonll(increment - (0xffffffffffffffff - + ntohll(addr6_128.u64.lo) + 1)); + } else { + OVS_NOT_REACHED(); + } + + memcpy(&ipv6_aligned->s6_addr32[0], &addr6_128.u64.hi, sizeof addr6_128.u64.hi); + memcpy(&ipv6_aligned->s6_addr32[2], &addr6_128.u64.lo, sizeof addr6_128.u64.lo); + + return; +} + +static uint32_t +nat_range_hash(const struct conn *conn, uint32_t basis) +{ + uint32_t hash = basis; + int i; + uint32_t port; + + for (i = 0; + i < sizeof(conn->nat_info->min_addr) / sizeof(uint32_t); + i++) { + hash = hash_add(hash, ((uint32_t *) &conn->nat_info->min_addr)[i]); + hash = hash_add(hash, ((uint32_t *) &conn->nat_info->max_addr)[i]); + } + + memcpy(&port, &conn->nat_info->min_port, sizeof port); + hash = hash_add(hash, port); + + uint32_t dl_type_for_hash = (uint32_t) conn->key.dl_type; + hash = hash_add(hash, dl_type_for_hash); + uint32_t nw_proto_for_hash = (uint32_t) conn->key.nw_proto; + hash = hash_add(hash, nw_proto_for_hash); + uint32_t zone_for_hash = (uint32_t) conn->key.zone; + hash = hash_add(hash, zone_for_hash); + return hash; +} + +static bool +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn, + struct conn *rev_conn) +{ +#define MIN_NAT_EPHEMERAL_PORT 1024 +#define MAX_NAT_EPHEMERAL_PORT 65535 + + uint16_t min_port; + uint16_t max_port; + uint16_t first_port; + + uint32_t hash = nat_range_hash(conn, ct->hash_basis); + + if ((conn->nat_info->nat_action & NAT_ACTION_SRC) && + (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) { + min_port = ntohs(conn->key.src.port); + max_port = ntohs(conn->key.src.port); + first_port = min_port; + } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) && + (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) { + min_port = ntohs(conn->key.dst.port); + max_port = ntohs(conn->key.dst.port); + first_port = min_port; + } else { + uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port; + uint32_t port_index = hash % (deltap + 1); + first_port = conn->nat_info->min_port + port_index; + min_port = conn->nat_info->min_port; + max_port = conn->nat_info->max_port; + } + + uint32_t deltaa = 0; + uint32_t address_index; + struct ct_addr ct_addr; + memset(&ct_addr, 0, sizeof ct_addr); + struct ct_addr max_ct_addr; + memset(&max_ct_addr, 0, sizeof max_ct_addr); + max_ct_addr = conn->nat_info->max_addr; + + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { + deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) - + ntohl(conn->nat_info->min_addr.ipv4_aligned); + address_index = hash % (deltaa + 1); + ct_addr.ipv4_aligned = htonl(ntohl(conn->nat_info->min_addr.ipv4_aligned) + + address_index); + } else { + deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned, + &conn->nat_info->max_addr.ipv6_aligned); + /* deltaa must be within 32 bits for full hash coverage. A 64 or 128 bit hash is + * unnecessary and hence not used here. Most code is kept common with V4; + * nat_ipv6_addrs_delta() will do the enforcement via max_ct_addr. */ + max_ct_addr = conn->nat_info->min_addr; + nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa); + + address_index = hash % (deltaa + 1); + ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned; + nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index); + } + + uint16_t port = first_port; + bool all_ports_tried = false; + bool original_ports_tried = false; + struct ct_addr first_addr = ct_addr; + *rev_conn = *conn; + while (true) { + if (conn->nat_info->nat_action & NAT_ACTION_SRC) { + rev_conn->rev_key.dst.addr = ct_addr; + rev_conn->rev_key.dst.port = htons(port); + } else { + rev_conn->rev_key.src.addr = ct_addr; + rev_conn->rev_key.src.port = htons(port); + } + struct nat_conn_key_node *nat_conn_key_node = + nat_conn_keys_lookup(&ct->nat_conn_keys, rev_conn, ct->hash_basis); + + if (!nat_conn_key_node) { + struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key); + memcpy(&nat_conn_key->key, &rev_conn->rev_key, sizeof nat_conn_key->key); + memcpy(&nat_conn_key->value, &rev_conn->key, sizeof nat_conn_key->value); + uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key, ct->hash_basis); + hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node, nat_conn_key_hash); + return true; + } else if (!all_ports_tried) { + if (min_port == max_port) { + all_ports_tried = true; + } else if(port == max_port) { + port = min_port; + } else { + port++; + } + if (port == first_port) { + all_ports_tried = true; + } + } else { + if(memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) { + if (conn->key.dl_type == htons(ETH_TYPE_IP)) { + ct_addr.ipv4_aligned = htonl(ntohl(ct_addr.ipv4_aligned) + 1); + } else { + nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1); + } + } else { + ct_addr = conn->nat_info->min_addr; + } + if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) { + if (!original_ports_tried) { + original_ports_tried = true; + ct_addr = conn->nat_info->min_addr; + min_port = MIN_NAT_EPHEMERAL_PORT; + max_port = MAX_NAT_EPHEMERAL_PORT; + } else { + break; + } + } + first_port = min_port; + port = first_port; + all_ports_tried = false; + } + } + return false; +} + +static struct nat_conn_key_node * +nat_conn_keys_lookup(struct hmap *nat_conn_keys, + const struct conn *conn, + uint32_t basis) +{ + struct nat_conn_key_node *nat_conn_key_node; + uint32_t nat_conn_key_hash = conn_key_hash(&conn->key, basis); + + HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, nat_conn_keys) { + if (!memcmp(&nat_conn_key_node->key, &conn->key, sizeof nat_conn_key_node->key)) { + return nat_conn_key_node; + } + } + return NULL; +} + +static void +nat_conn_keys_remove(struct hmap *nat_conn_keys, + const struct conn *conn, + uint32_t basis) +{ + struct nat_conn_key_node *nat_conn_key_node; + uint32_t nat_conn_key_hash = conn_key_hash(&conn->key, basis); + + HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, nat_conn_keys) { + if (!memcmp(&nat_conn_key_node->key, &conn->key, sizeof nat_conn_key_node->key)) { + hmap_remove(nat_conn_keys, &nat_conn_key_node->node); + free(nat_conn_key_node); + return; + } + } +} + static void conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx, @@ -1036,13 +1598,13 @@ conn_key_lookup(struct conntrack_bucket *ctb, ctx->conn = NULL; HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) { - if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key)) + if (!memcmp(&conn->key, &ctx->key, sizeof conn->key) && !conn_expired(conn, now)) { ctx->conn = conn; ctx->reply = false; break; } - if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key)) + if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key) && !conn_expired(conn, now)) { ctx->conn = conn; ctx->reply = true; @@ -1062,7 +1624,10 @@ conn_update(struct conn *conn, struct conntrack_bucket *ctb, static bool conn_expired(struct conn *conn, long long now) { - return now >= conn->expiration; + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { + return now >= conn->expiration; + } + return false; } static bool @@ -1089,6 +1654,7 @@ new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt, static void delete_conn(struct conn *conn) { + free(conn->nat_info); free(conn); } @@ -1141,7 +1707,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, entry->zone = conn->key.zone; entry->mark = conn->mark; - memcpy(&entry->labels, &conn->label, sizeof(entry->labels)); + memcpy(&entry->labels, &conn->label, sizeof entry->labels); /* Not implemented yet */ entry->timestamp.start = 0; entry->timestamp.stop = 0; @@ -1188,7 +1754,8 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) break; } INIT_CONTAINER(conn, node, node); - if (!dump->filter_zone || conn->key.zone == dump->zone) { + if ((!dump->filter_zone || conn->key.zone == dump->zone) && + (conn->conn_type != CT_CONN_TYPE_UN_NAT)){ conn_to_ct_dpif_entry(conn, entry, now); break; } @@ -1224,14 +1791,21 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone) ct_lock_lock(&ct->buckets[i].lock); HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) { if (!zone || *zone == conn->key.zone) { - ovs_list_remove(&conn->exp_node); + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { + ovs_list_remove(&conn->exp_node); + } hmap_remove(&ct->buckets[i].connections, &conn->node); - atomic_count_dec(&ct->n_conn); - delete_conn(conn); + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { + atomic_count_dec(&ct->n_conn); + } + if (conn->nat_info) { + nat_clean(ct, conn, &ct->buckets[i]); + } else { + delete_conn(conn); + } } } ct_lock_unlock(&ct->buckets[i].lock); } - return 0; } diff --git a/lib/conntrack.h b/lib/conntrack.h index cbdfb91..5174123 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -123,6 +123,10 @@ struct OVS_LOCKABLE ct_lock { struct ovs_mutex lock; }; +struct OVS_LOCKABLE ct_rwlock { + struct ovs_rwlock lock; +}; + static inline void ct_lock_init(struct ct_lock *lock) { ovs_mutex_init_adaptive(&lock->lock); @@ -146,6 +150,38 @@ static inline void ct_lock_destroy(struct ct_lock *lock) { ovs_mutex_destroy(&lock->lock); } + +static inline void ct_rwlock_init(struct ct_rwlock *lock) +{ + ovs_rwlock_init(&lock->lock); +} + +static inline void ct_rwlock_wrlock(struct ct_rwlock *lock) + OVS_ACQUIRES(lock) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + ovs_rwlock_wrlock(&lock->lock); +} + +static inline void ct_rwlock_rdlock(struct ct_rwlock *lock) + OVS_ACQUIRES(lock) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + ovs_rwlock_rdlock(&lock->lock); +} + +static inline void ct_rwlock_unlock(struct ct_rwlock *lock) + OVS_RELEASES(lock) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + ovs_rwlock_unlock(&lock->lock); +} + +static inline void ct_rwlock_destroy(struct ct_rwlock *lock) +{ + ovs_rwlock_destroy(&lock->lock); +} + /* Timeouts: all the possible timeout states passed to update_expiration() * are listed here. The name will be prefix by CT_TM_ and the value is in @@ -228,6 +264,12 @@ struct conntrack { /* Connections limit. When this limit is reached, no new connection * will be accepted. */ atomic_uint n_conn_limit; + + /* The following resources are referenced during nat connection + * creation and deletion. */ + struct hmap nat_conn_keys OVS_GUARDED; + struct ct_rwlock nat_resources_lock; + }; #endif /* conntrack.h */ -- 1.9.1 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev