Paolo Valerio <[email protected]> writes:
> From: hepeng <[email protected]>
>
> The patch avoids the extra allocation for nat_conn.
> Currently, when doing NAT, the userspace conntrack will use an extra
> conn for the two directions in a flow. However, each conn has actually
> the two keys for both orig and rev directions. This patch introduces a
> key_node[CT_DIRS] member in the conn which consists of a key, direction,
> and a cmap_node for hash lookup so addressing the feedback received by
> the original patch [0].
>
> The patch is an alternative approach to [1].
> The patch has the advantage of solving the issue in a clean way, but,
> unlike [1], it has the disadvantage of requiring some changes to the
> connection clean up for older branches (down to 2.17) and all the
> related operations. To make an idea, [0] contains most of the changes
> required.
>
> [0]
> https://patchwork.ozlabs.org/project/openvswitch/patch/[email protected]/
> [1]
> https://patchwork.ozlabs.org/project/openvswitch/list/?series=351579&state=*
>
> Signed-off-by: Peng He <[email protected]>
> Co-authored-by: Paolo Valerio <[email protected]>
> Signed-off-by: Paolo Valerio <[email protected]>
> ---
Thanks for the RFC - I actually favor this approach for a number of
reasons, and the implementation you provide here looks great on the
patch stats line. Just a couple of nits below. I also tried it out
under ASAN and UBSAN and didn't see any new issues pop up, so I'm happy
with this RFC.
> lib/conntrack-private.h | 19 ++-
> lib/conntrack-tp.c | 6 +
> lib/conntrack.c | 339
> +++++++++++++++++++----------------------------
> 3 files changed, 149 insertions(+), 215 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index bb326868e..3fd5fccd3 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -49,6 +49,12 @@ struct ct_endpoint {
> * hashing in ct_endpoint_hash_add(). */
> BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4);
>
> +enum key_dir {
> + CT_DIR_FWD = 0,
> + CT_DIR_REV,
> + CT_DIRS,
> +};
> +
> /* Changes to this structure need to be reflected in conn_key_hash()
> * and conn_key_cmp(). */
> struct conn_key {
> @@ -112,20 +118,18 @@ enum ct_timeout {
>
> #define N_EXP_LISTS 100
>
> -enum OVS_PACKED_ENUM ct_conn_type {
> - CT_CONN_TYPE_DEFAULT,
> - CT_CONN_TYPE_UN_NAT,
> +struct conn_key_node {
> + enum key_dir dir;
> + struct conn_key key;
> + struct cmap_node cm_node;
> };
>
> struct conn {
> /* Immutable data. */
> - struct conn_key key;
> - struct conn_key rev_key;
> + struct conn_key_node key_node[CT_DIRS];
> struct conn_key parent_key; /* Only used for orig_tuple support. */
> - struct cmap_node cm_node;
> uint16_t nat_action;
> char *alg;
> - struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
> atomic_flag reclaimed; /* False during the lifetime of the connection,
> * True as soon as a thread has started freeing
> * its memory. */
> @@ -150,7 +154,6 @@ struct conn {
>
> /* Immutable data. */
> bool alg_related; /* True if alg data connection. */
> - enum ct_conn_type conn_type;
>
> uint32_t tp_id; /* Timeout policy ID. */
> };
> diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> index 89cb2704a..2149fdc73 100644
> --- a/lib/conntrack-tp.c
> +++ b/lib/conntrack-tp.c
> @@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn
> *conn,
> }
> VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
> "val=%u sec.",
> - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
> + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
> + conn->tp_id, val);
>
> atomic_store_relaxed(&conn->expiration, now + val * 1000);
> }
> @@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn
> *conn,
> }
>
> VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
> - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
> + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
> + conn->tp_id, val);
>
> conn->expiration = now + val * 1000;
> }
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 5f1176d33..6f219eb9e 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -113,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *,
> static void *clean_thread_main(void *f_);
>
> static bool
> -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
> - struct conn *nat_conn,
> +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
> const struct nat_action_info_t *nat_info);
>
> static uint8_t
> @@ -234,61 +233,6 @@ conn_key_cmp(const struct conn_key *key1, const struct
> conn_key *key2)
> return 1;
> }
>
> -static void
> -ct_print_conn_info(const struct conn *c, const char *log_msg,
> - enum vlog_level vll, bool force, bool rl_on)
> -{
> -#define CT_VLOG(RL_ON, LEVEL, ...) \
> - do { \
> - if (RL_ON) { \
> - static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
> - vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
> - } else { \
> - vlog(&this_module, LEVEL, __VA_ARGS__); \
> - } \
> - } while (0)
> -
> - if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
> - if (c->key.dl_type == htons(ETH_TYPE_IP)) {
> - CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src
> "
> - "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
> - "%"PRIu16"/%"PRIu16" rev src/dst ports "
> - "%"PRIu16"/%"PRIu16" zone/rev zone "
> - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
> - "%"PRIu8"/%"PRIu8, log_msg,
> - IP_ARGS(c->key.src.addr.ipv4),
> - IP_ARGS(c->key.dst.addr.ipv4),
> - IP_ARGS(c->rev_key.src.addr.ipv4),
> - IP_ARGS(c->rev_key.dst.addr.ipv4),
> - ntohs(c->key.src.port), ntohs(c->key.dst.port),
> - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
> - c->key.zone, c->rev_key.zone, c->key.nw_proto,
> - c->rev_key.nw_proto);
> - } else {
> - char ip6_s[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
> - char ip6_d[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
> - char ip6_rs[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
> - sizeof ip6_rs);
> - char ip6_rd[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
> - sizeof ip6_rd);
> -
> - CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
> - " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
> - " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
> - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
> - "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
> - ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
> - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
> - c->key.zone, c->rev_key.zone, c->key.nw_proto,
> - c->rev_key.nw_proto);
> - }
> - }
> -}
> -
> /* Initializes the connection tracker 'ct'. The caller is responsible for
> * calling 'conntrack_destroy()', when the instance is not needed anymore */
> struct conntrack *
> @@ -477,15 +421,16 @@ conn_clean__(struct conntrack *ct, struct conn *conn)
> uint32_t hash;
>
> if (conn->alg) {
> - expectation_clean(ct, &conn->key);
> + expectation_clean(ct, &conn->key_node[CT_DIR_FWD].key);
> }
>
> - hash = conn_key_hash(&conn->key, ct->hash_basis);
> - cmap_remove(&ct->conns, &conn->cm_node, hash);
> + hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis);
> + cmap_remove(&ct->conns, &conn->key_node[CT_DIR_FWD].cm_node, hash);
>
> - if (conn->nat_conn) {
> - hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
> - cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
> + if (conn->nat_action) {
> + hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key,
> + ct->hash_basis);
> + cmap_remove(&ct->conns, &conn->key_node[CT_DIR_REV].cm_node, hash);
> }
>
> rculist_remove(&conn->node);
> @@ -497,8 +442,6 @@ static void
> conn_clean(struct conntrack *ct, struct conn *conn)
> OVS_EXCLUDED(conn->lock, ct->ct_lock)
> {
> - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> -
> if (atomic_flag_test_and_set(&conn->reclaimed)) {
> return;
> }
> @@ -585,34 +528,34 @@ conn_key_lookup(struct conntrack *ct, const struct
> conn_key *key,
> uint32_t hash, long long now, struct conn **conn_out,
> bool *reply)
> {
> - struct conn *conn;
> + struct conn_key_node *keyn;
> + struct conn *conn = NULL;
> bool found = false;
>
> - CMAP_FOR_EACH_WITH_HASH (conn, cm_node, hash, &ct->conns) {
> + CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns) {
> + conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->dir]);
> if (conn_expired(conn, now)) {
> continue;
> }
> - if (!conn_key_cmp(&conn->key, key)) {
> - found = true;
> - if (reply) {
> - *reply = false;
> - }
> - break;
> - }
> - if (!conn_key_cmp(&conn->rev_key, key)) {
> - found = true;
> - if (reply) {
> - *reply = true;
> +
> + for (int i = CT_DIR_FWD; i < CT_DIRS; i++) {
> + if (!conn_key_cmp(&conn->key_node[i].key, key)) {
> + found = true;
> + if (reply) {
> + *reply = i;
> + }
> + goto out_found;
> }
> - break;
> }
> }
>
> +out_found:
> if (found && conn_out) {
> *conn_out = conn;
> } else if (conn_out) {
> *conn_out = NULL;
> }
> +
> return found;
> }
>
> @@ -646,7 +589,7 @@ write_ct_md(struct dp_packet *pkt, uint16_t zone, const
> struct conn *conn,
> if (conn->alg_related) {
> key = &conn->parent_key;
> } else {
> - key = &conn->key;
> + key = &conn->key_node[CT_DIR_FWD].key;
> }
> } else if (alg_exp) {
> pkt->md.ct_mark = alg_exp->parent_mark;
> @@ -877,7 +820,7 @@ nat_inner_packet(struct dp_packet *pkt, struct conn_key
> *key,
> static void
> nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool
> related)
> {
> - struct conn_key *key = reply ? &conn->key : &conn->rev_key;
> + struct conn_key *key = &conn->key_node[!reply].key;
I don't like this particular style here. I recognize that gcc using
stdbool will do the right thing, but I'm less sure about other compilers
not setting !0 == 0xffffffff or something like that. I would prefer to
see a test where we get the enum value and trust the optimizer to do the
right thing. Actually, I see we have just a couple of places where we
do this check (and we probably should think about cleaning them up since
the coding style says we should not depend on the explicit values).
> uint16_t nat_action = reply ? nat_action_reverse(conn->nat_action)
> : conn->nat_action;
>
> @@ -911,7 +854,7 @@ conn_seq_skew_set(struct conntrack *ct, const struct conn
> *conn_in,
> {
> struct conn *conn;
>
> - conn_lookup(ct, &conn_in->key, now, &conn, NULL);
> + conn_lookup(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL);
> if (conn && seq_skew) {
> conn->seq_skew = seq_skew;
> conn->seq_skew_dir = seq_skew_dir;
> @@ -947,7 +890,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> OVS_REQUIRES(ct->ct_lock)
> {
> struct conn *nc = NULL;
> - struct conn *nat_conn = NULL;
>
> if (!valid_new(pkt, &ctx->key)) {
> pkt->md.ct_state = CS_INVALID;
> @@ -961,6 +903,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> }
>
> if (commit) {
> + struct conn_key_node *fwd_key_node, *rev_key_node;
> struct zone_limit *zl = zone_limit_lookup_or_default(ct,
> ctx->key.zone);
> if (zl && atomic_count_get(&zl->czl.count) >= zl->czl.limit) {
> @@ -975,9 +918,12 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> }
>
> nc = new_conn(ct, pkt, &ctx->key, now, tp_id);
> - memcpy(&nc->key, &ctx->key, sizeof nc->key);
> - memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key);
> - conn_key_reverse(&nc->rev_key);
> + fwd_key_node = &nc->key_node[CT_DIR_FWD];
> + rev_key_node = &nc->key_node[CT_DIR_REV];
> + memcpy(&fwd_key_node->key, &ctx->key, sizeof fwd_key_node->key);
> + memcpy(&rev_key_node->key, &fwd_key_node->key,
> + sizeof rev_key_node->key);
> + conn_key_reverse(&rev_key_node->key);
>
> if (ct_verify_helper(helper, ct_alg_ctl)) {
> nc->alg = nullable_xstrdup(helper);
> @@ -992,46 +938,33 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
>
> if (nat_action_info) {
> nc->nat_action = nat_action_info->nat_action;
> - nat_conn = xzalloc(sizeof *nat_conn);
>
> if (alg_exp) {
> if (alg_exp->nat_rpl_dst) {
> - nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
> + rev_key_node->key.dst.addr = alg_exp->alg_nat_repl_addr;
> nc->nat_action = NAT_ACTION_SRC;
> } else {
> - nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
> + rev_key_node->key.src.addr = alg_exp->alg_nat_repl_addr;
> nc->nat_action = NAT_ACTION_DST;
> }
> } else {
> - memcpy(nat_conn, nc, sizeof *nat_conn);
> - bool nat_res = nat_get_unique_tuple(ct, nc, nat_conn,
> - nat_action_info);
> -
> + bool nat_res = nat_get_unique_tuple(ct, nc, nat_action_info);
> if (!nat_res) {
> goto nat_res_exhaustion;
> }
> -
> - /* Update nc with nat adjustments made to nat_conn by
> - * nat_get_unique_tuple(). */
> - memcpy(nc, nat_conn, sizeof *nc);
> }
>
> nat_packet(pkt, nc, false, ctx->icmp_related);
> - memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
> - memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key);
> - nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
> - nat_conn->nat_action = 0;
> - nat_conn->alg = NULL;
> - nat_conn->nat_conn = NULL;
> - uint32_t nat_hash = conn_key_hash(&nat_conn->key,
> ct->hash_basis);
> - cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
> + uint32_t rev_hash = conn_key_hash(&rev_key_node->key,
> + ct->hash_basis);
> + cmap_insert(&ct->conns, &rev_key_node->cm_node, rev_hash);
> }
>
> - nc->nat_conn = nat_conn;
> ovs_mutex_init_adaptive(&nc->lock);
> - nc->conn_type = CT_CONN_TYPE_DEFAULT;
> atomic_flag_clear(&nc->reclaimed);
> - cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
> + fwd_key_node->dir = CT_DIR_FWD;
> + rev_key_node->dir = CT_DIR_REV;
> + cmap_insert(&ct->conns, &fwd_key_node->cm_node, ctx->hash);
> conn_expire_push_front(ct, nc);
> atomic_count_inc(&ct->n_conn);
> ctx->conn = nc; /* For completeness. */
> @@ -1052,7 +985,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> * firewall rules or a separate firewall. Also using zone partitioning
> * can limit DoS impact. */
> nat_res_exhaustion:
> - free(nat_conn);
> delete_conn__(nc);
> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
> VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
> @@ -1065,7 +997,6 @@ conn_update_state(struct conntrack *ct, struct dp_packet
> *pkt,
> struct conn_lookup_ctx *ctx, struct conn *conn,
> long long now)
> {
> - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> bool create_new_conn = false;
>
> if (ctx->icmp_related) {
> @@ -1092,7 +1023,8 @@ conn_update_state(struct conntrack *ct, struct
> dp_packet *pkt,
> pkt->md.ct_state = CS_INVALID;
> break;
> case CT_UPDATE_NEW:
> - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
> + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
> + now, NULL, NULL)) {
> conn_force_expire(conn);
> }
> create_new_conn = true;
> @@ -1269,7 +1201,7 @@ initial_conn_lookup(struct conntrack *ct, struct
> conn_lookup_ctx *ctx,
> if (natted) {
> if (OVS_LIKELY(ctx->conn)) {
> ctx->reply = !ctx->reply;
> - ctx->key = ctx->reply ? ctx->conn->rev_key : ctx->conn->key;
> + ctx->key = ctx->conn->key_node[ctx->reply].key;
See above for similar reasoning about 1, 0, and bool types. I think it
might be safer here since _Bool in c99 should just hold the values
true/false, and c99 will be consistent about it.
> ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
> } else {
> /* A lookup failure does not necessarily imply that an
> @@ -1302,31 +1234,13 @@ process_one(struct conntrack *ct, struct dp_packet
> *pkt,
>
> /* Delete found entry if in wrong direction. 'force' implies commit. */
> if (OVS_UNLIKELY(force && ctx->reply && conn)) {
> - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
> + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
> + now, NULL, NULL)) {
> conn_force_expire(conn);
> }
> conn = NULL;
> }
>
> - if (OVS_LIKELY(conn)) {
> - if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
> -
> - ctx->reply = true;
> - struct conn *rev_conn = conn; /* Save for debugging. */
> - uint32_t hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
> - conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply);
> -
> - if (!conn) {
> - pkt->md.ct_state |= CS_INVALID;
> - write_ct_md(pkt, zone, NULL, NULL, NULL);
> - char *log_msg = xasprintf("Missing parent conn %p",
> rev_conn);
> - ct_print_conn_info(rev_conn, log_msg, VLL_INFO, true, true);
> - free(log_msg);
> - return;
> - }
> - }
> - }
> -
> enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
> helper);
>
> @@ -1419,8 +1333,9 @@ conntrack_execute(struct conntrack *ct, struct
> dp_packet_batch *pkt_batch,
> struct conn *conn = packet->md.conn;
> if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) {
> write_ct_md(packet, zone, NULL, NULL, NULL);
> - } else if (conn && conn->key.zone == zone && !force
> - && !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) {
> + } else if (conn &&
> + conn->key_node[CT_DIR_FWD].key.zone == zone && !force &&
> + !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) {
> process_one_fast(zone, setmark, setlabel, nat_action_info,
> conn, packet);
> } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx,
> @@ -2269,7 +2184,7 @@ nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t
> increment)
> }
>
> static uint32_t
> -nat_range_hash(const struct conn *conn, uint32_t basis,
> +nat_range_hash(const struct conn_key *key, uint32_t basis,
> const struct nat_action_info_t *nat_info)
> {
> uint32_t hash = basis;
> @@ -2279,11 +2194,11 @@ nat_range_hash(const struct conn *conn, uint32_t
> basis,
> hash = hash_add(hash,
> ((uint32_t) nat_info->max_port << 16)
> | nat_info->min_port);
> - hash = ct_endpoint_hash_add(hash, &conn->key.src);
> - hash = ct_endpoint_hash_add(hash, &conn->key.dst);
> - hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
> - hash = hash_add(hash, conn->key.nw_proto);
> - hash = hash_add(hash, conn->key.zone);
> + hash = ct_endpoint_hash_add(hash, &key->src);
> + hash = ct_endpoint_hash_add(hash, &key->dst);
> + hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
> + hash = hash_add(hash, key->nw_proto);
> + hash = hash_add(hash, key->zone);
>
> /* The purpose of the second parameter is to distinguish hashes of data
> of
> * different length; our data always has the same length so there is no
> @@ -2357,7 +2272,7 @@ get_addr_in_range(union ct_addr *min, union ct_addr
> *max,
> }
>
> static void
> -find_addr(const struct conn *conn, union ct_addr *min,
> +find_addr(const struct conn_key *key, union ct_addr *min,
> union ct_addr *max, union ct_addr *curr,
> uint32_t hash, bool ipv4,
> const struct nat_action_info_t *nat_info)
> @@ -2367,9 +2282,9 @@ find_addr(const struct conn *conn, union ct_addr *min,
> /* All-zero case. */
> if (!memcmp(min, &zero_ip, sizeof *min)) {
> if (nat_info->nat_action & NAT_ACTION_SRC) {
> - *curr = conn->key.src.addr;
> + *curr = key->src.addr;
> } else if (nat_info->nat_action & NAT_ACTION_DST) {
> - *curr = conn->key.dst.addr;
> + *curr = key->dst.addr;
> }
> } else {
> get_addr_in_range(min, max, curr, hash, ipv4);
> @@ -2388,7 +2303,7 @@ store_addr_to_key(union ct_addr *addr, struct conn_key
> *key,
> }
>
> static bool
> -nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn,
> +nat_get_unique_l4(struct conntrack *ct, struct conn_key *rev_key,
> ovs_be16 *port, uint16_t curr, uint16_t min,
> uint16_t max)
> {
> @@ -2411,7 +2326,7 @@ another_round:
> }
>
> *port = htons(curr);
> - if (!conn_lookup(ct, &nat_conn->rev_key,
> + if (!conn_lookup(ct, rev_key,
> time_msec(), NULL, NULL)) {
> return true;
> }
> @@ -2450,39 +2365,41 @@ another_round:
> *
> * If none can be found, return exhaustion to the caller. */
> static bool
> -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
> - struct conn *nat_conn,
> +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
> const struct nat_action_info_t *nat_info)
> {
> - uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info);
> + struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key,
> + *rev_key = &conn->key_node[CT_DIR_REV].key;
> union ct_addr min_addr = {0}, max_addr = {0}, addr = {0};
> - bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
> - conn->key.nw_proto == IPPROTO_UDP ||
> - conn->key.nw_proto == IPPROTO_SCTP;
> + bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
> + fwd_key->nw_proto == IPPROTO_UDP ||
> + fwd_key->nw_proto == IPPROTO_SCTP;
> uint16_t min_dport, max_dport, curr_dport;
> uint16_t min_sport, max_sport, curr_sport;
> + uint32_t hash;
>
> + hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
> min_addr = nat_info->min_addr;
> max_addr = nat_info->max_addr;
>
> - find_addr(conn, &min_addr, &max_addr, &addr, hash,
> - (conn->key.dl_type == htons(ETH_TYPE_IP)), nat_info);
> + find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
> + (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
>
> - set_sport_range(nat_info, &conn->key, hash, &curr_sport,
> - &min_sport, &max_sport);
> - set_dport_range(nat_info, &conn->key, hash, &curr_dport,
> - &min_dport, &max_dport);
> + set_sport_range(nat_info, fwd_key, hash, &curr_sport,
> + &min_sport, &max_sport);
> + set_dport_range(nat_info, fwd_key, hash, &curr_dport,
> + &min_dport, &max_dport);
>
> if (pat_proto) {
> - nat_conn->rev_key.src.port = htons(curr_dport);
> - nat_conn->rev_key.dst.port = htons(curr_sport);
> + rev_key->src.port = htons(curr_dport);
> + rev_key->dst.port = htons(curr_sport);
> }
>
> - store_addr_to_key(&addr, &nat_conn->rev_key,
> + store_addr_to_key(&addr, rev_key,
> nat_info->nat_action);
>
> if (!pat_proto) {
> - if (!conn_lookup(ct, &nat_conn->rev_key,
> + if (!conn_lookup(ct, rev_key,
> time_msec(), NULL, NULL)) {
> return true;
> }
> @@ -2492,12 +2409,12 @@ nat_get_unique_tuple(struct conntrack *ct, const
> struct conn *conn,
>
> bool found = false;
> if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
> - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port,
> + found = nat_get_unique_l4(ct, rev_key, &rev_key->src.port,
> curr_dport, min_dport, max_dport);
> }
>
> if (!found) {
> - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port,
> + found = nat_get_unique_l4(ct, rev_key, &rev_key->dst.port,
> curr_sport, min_sport, max_sport);
> }
>
> @@ -2513,9 +2430,10 @@ conn_update(struct conntrack *ct, struct conn *conn,
> struct dp_packet *pkt,
> struct conn_lookup_ctx *ctx, long long now)
> {
> ovs_mutex_lock(&conn->lock);
> + uint8_t nw_proto = conn->key_node[CT_DIR_FWD].key.nw_proto;
> enum ct_update_res update_res =
> - l4_protos[conn->key.nw_proto]->conn_update(ct, conn, pkt, ctx->reply,
> - now);
> + l4_protos[nw_proto]->conn_update(ct, conn, pkt, ctx->reply,
> + now);
> ovs_mutex_unlock(&conn->lock);
> return update_res;
> }
> @@ -2543,10 +2461,7 @@ conn_expiration(const struct conn *conn)
> static bool
> conn_expired(struct conn *conn, long long now)
> {
> - if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> - return now >= conn_expiration(conn);
> - }
> - return false;
> + return now >= conn_expiration(conn);
> }
>
> static bool
> @@ -2572,9 +2487,7 @@ delete_conn__(struct conn *conn)
> static void
> delete_conn(struct conn *conn)
> {
> - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> ovs_mutex_destroy(&conn->lock);
> - free(conn->nat_conn);
> delete_conn__(conn);
> }
>
> @@ -2667,15 +2580,18 @@ static void
> conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
> long long now)
> {
> + const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key,
> + *rev_key = &conn->key_node[CT_DIR_REV].key;
> +
> memset(entry, 0, sizeof *entry);
> - conn_key_to_tuple(&conn->key, &entry->tuple_orig);
> - conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
> + conn_key_to_tuple(key, &entry->tuple_orig);
> + conn_key_to_tuple(rev_key, &entry->tuple_reply);
>
> if (conn->alg_related) {
> conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent);
> }
>
> - entry->zone = conn->key.zone;
> + entry->zone = key->zone;
>
> ovs_mutex_lock(&conn->lock);
> entry->mark = conn->mark;
> @@ -2683,7 +2599,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct
> ct_dpif_entry *entry,
>
> long long expiration = conn_expiration(conn) - now;
>
> - struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
> + struct ct_l4_proto *class = l4_protos[key->nw_proto];
> if (class->conn_get_protoinfo) {
> class->conn_get_protoinfo(conn, &entry->protoinfo);
> }
> @@ -2731,15 +2647,17 @@ conntrack_dump_next(struct conntrack_dump *dump,
> struct ct_dpif_entry *entry)
> if (!cm_node) {
> break;
> }
> + struct conn_key_node *keyn;
> struct conn *conn;
> - INIT_CONTAINER(conn, cm_node, cm_node);
>
> + INIT_CONTAINER(keyn, cm_node, cm_node);
> + conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->dir]);
> if (conn_expired(conn, now)) {
> continue;
> }
>
> - if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
> - (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
> + if ((!dump->filter_zone || keyn->key.zone == dump->zone) &&
> + (keyn->dir == CT_DIR_FWD)) {
> conn_to_ct_dpif_entry(conn, entry, now);
> return 0;
> }
> @@ -2823,14 +2741,15 @@ conntrack_exp_dump_done(struct conntrack_dump *dump
> OVS_UNUSED)
> int
> conntrack_flush(struct conntrack *ct, const uint16_t *zone)
> {
> + struct conn_key_node *keyn;
> struct conn *conn;
>
> - CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
> - if (conn->conn_type != CT_CONN_TYPE_DEFAULT) {
> + CMAP_FOR_EACH (keyn, cm_node, &ct->conns) {
> + if (keyn->dir != CT_DIR_FWD) {
> continue;
> }
> -
> - if (!zone || *zone == conn->key.zone) {
> + conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->dir]);
> + if (!zone || *zone == keyn->key.zone) {
> conn_clean(ct, conn);
> }
> }
> @@ -2842,18 +2761,18 @@ int
> conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple
> *tuple,
> uint16_t zone)
> {
> - int error = 0;
> struct conn_key key;
> struct conn *conn;
> + int error = 0;
>
> memset(&key, 0, sizeof(key));
> tuple_to_conn_key(tuple, zone, &key);
> conn_lookup(ct, &key, time_msec(), &conn, NULL);
>
> - if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> + if (conn) {
> conn_clean(ct, conn);
> } else {
> - VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
> + VLOG_WARN("Tuple not found");
> error = ENOENT;
> }
>
> @@ -2996,50 +2915,52 @@ expectation_create(struct conntrack *ct, ovs_be16
> dst_port,
> const struct conn *parent_conn, bool reply, bool
> src_ip_wc,
> bool skip_nat)
> {
> + const struct conn_key *pconn_key =
> &parent_conn->key_node[CT_DIR_FWD].key,
> + *pconn_rev_key = &parent_conn->key_node[CT_DIR_REV].key;
> union ct_addr src_addr;
> union ct_addr dst_addr;
> union ct_addr alg_nat_repl_addr;
> struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
>
> if (reply) {
> - src_addr = parent_conn->key.src.addr;
> - dst_addr = parent_conn->key.dst.addr;
> + src_addr = pconn_key->src.addr;
> + dst_addr = pconn_key->dst.addr;
> alg_exp_node->nat_rpl_dst = true;
> if (skip_nat) {
> alg_nat_repl_addr = dst_addr;
> } else if (parent_conn->nat_action & NAT_ACTION_DST) {
> - alg_nat_repl_addr = parent_conn->rev_key.src.addr;
> + alg_nat_repl_addr = pconn_rev_key->src.addr;
> alg_exp_node->nat_rpl_dst = false;
> } else {
> - alg_nat_repl_addr = parent_conn->rev_key.dst.addr;
> + alg_nat_repl_addr = pconn_rev_key->dst.addr;
> }
> } else {
> - src_addr = parent_conn->rev_key.src.addr;
> - dst_addr = parent_conn->rev_key.dst.addr;
> + src_addr = pconn_rev_key->src.addr;
> + dst_addr = pconn_rev_key->dst.addr;
> alg_exp_node->nat_rpl_dst = false;
> if (skip_nat) {
> alg_nat_repl_addr = src_addr;
> } else if (parent_conn->nat_action & NAT_ACTION_DST) {
> - alg_nat_repl_addr = parent_conn->key.dst.addr;
> + alg_nat_repl_addr = pconn_key->dst.addr;
> alg_exp_node->nat_rpl_dst = true;
> } else {
> - alg_nat_repl_addr = parent_conn->key.src.addr;
> + alg_nat_repl_addr = pconn_key->src.addr;
> }
> }
> if (src_ip_wc) {
> memset(&src_addr, 0, sizeof src_addr);
> }
>
> - alg_exp_node->key.dl_type = parent_conn->key.dl_type;
> - alg_exp_node->key.nw_proto = parent_conn->key.nw_proto;
> - alg_exp_node->key.zone = parent_conn->key.zone;
> + alg_exp_node->key.dl_type = pconn_key->dl_type;
> + alg_exp_node->key.nw_proto = pconn_key->nw_proto;
> + alg_exp_node->key.zone = pconn_key->zone;
> alg_exp_node->key.src.addr = src_addr;
> alg_exp_node->key.dst.addr = dst_addr;
> alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
> alg_exp_node->key.dst.port = dst_port;
> alg_exp_node->parent_mark = parent_conn->mark;
> alg_exp_node->parent_label = parent_conn->label;
> - memcpy(&alg_exp_node->parent_key, &parent_conn->key,
> + memcpy(&alg_exp_node->parent_key, pconn_key,
> sizeof alg_exp_node->parent_key);
> /* Take the write lock here because it is almost 100%
> * likely that the lookup will fail and
> @@ -3291,12 +3212,16 @@ process_ftp_ctl_v4(struct conntrack *ct,
>
> switch (mode) {
> case CT_FTP_MODE_ACTIVE:
> - *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4;
> - conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4;
> + *v4_addr_rep =
> + conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr.ipv4;
> + conn_ipv4_addr =
> + conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv4;
> break;
> case CT_FTP_MODE_PASSIVE:
> - *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4;
> - conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4;
> + *v4_addr_rep =
> + conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr.ipv4;
> + conn_ipv4_addr =
> + conn_for_expectation->key_node[CT_DIR_REV].key.src.addr.ipv4;
> break;
> case CT_TFTP_MODE:
> default:
> @@ -3396,17 +3321,20 @@ process_ftp_ctl_v6(struct conntrack *ct,
>
> switch (*mode) {
> case CT_FTP_MODE_ACTIVE:
> - *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
> + *v6_addr_rep =
> + conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr;
> /* Although most servers will block this exploit, there may be some
> * less well managed. */
> if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
> - memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6,
> + memcmp(&ip6_addr,
> +
> &conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv6,
> sizeof ip6_addr)) {
> return CT_FTP_CTL_INVALID;
> }
> break;
> case CT_FTP_MODE_PASSIVE:
> - *v6_addr_rep = conn_for_expectation->key.dst.addr;
> + *v6_addr_rep =
> + conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr;
> break;
> case CT_TFTP_MODE:
> default:
> @@ -3571,7 +3499,8 @@ handle_tftp_ctl(struct conntrack *ct,
> long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl
> OVS_UNUSED,
> bool nat OVS_UNUSED)
> {
> - expectation_create(ct, conn_for_expectation->key.src.port,
> + expectation_create(ct,
> +
> conn_for_expectation->key_node[CT_DIR_FWD].key.src.port,
> conn_for_expectation,
> !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
> }
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev