On 8/23/23 14:53, Paolo Valerio wrote:
> From: hepeng <[email protected]>
>
> The patch avoids the extra allocation for nat_conn.
> Currently, when doing NAT, the userspace conntrack will use an extra
> conn for the two directions in a flow. However, each conn has actually
> the two keys for both orig and rev directions. This patch introduces a
> key_node[CT_DIRS] member as per Aaron's suggestion in the conn which
> consists of a key, direction, and a cmap_node for hash lookup so
> addressing the feedback received by the original patch [0].
>
> [0]
> https://patchwork.ozlabs.org/project/openvswitch/patch/[email protected]/
>
> Signed-off-by: Peng He <[email protected]>
> Co-authored-by: Paolo Valerio <[email protected]>
> Signed-off-by: Paolo Valerio <[email protected]>
> ---
> v2:
> - use enum value instead of bool (Aaron).
> - s/conn_for_expectation/conn_for_exp/ in process_ftp_ctl_v6()
> to avoid long line.
> - removed CT_CONN_TYPE_* reference in two comments.
> ---
> lib/conntrack-private.h | 19 +--
> lib/conntrack-tp.c | 6 +
> lib/conntrack.c | 350
> +++++++++++++++++++----------------------------
> 3 files changed, 155 insertions(+), 220 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index bb326868e..3fd5fccd3 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -49,6 +49,12 @@ struct ct_endpoint {
> * hashing in ct_endpoint_hash_add(). */
> BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4);
>
> +enum key_dir {
> + CT_DIR_FWD = 0,
> + CT_DIR_REV,
> + CT_DIRS,
> +};
> +
> /* Changes to this structure need to be reflected in conn_key_hash()
> * and conn_key_cmp(). */
> struct conn_key {
> @@ -112,20 +118,18 @@ enum ct_timeout {
>
> #define N_EXP_LISTS 100
>
> -enum OVS_PACKED_ENUM ct_conn_type {
> - CT_CONN_TYPE_DEFAULT,
> - CT_CONN_TYPE_UN_NAT,
> +struct conn_key_node {
> + enum key_dir dir;
> + struct conn_key key;
> + struct cmap_node cm_node;
> };
This structure and the whole business of adding the connection
to cmap twice with different hashes is bothering me, but I really
don't have a better solution for this, so let it be... :)
Just to refresh my memory, we do that because original and reply
tuples can be completely different due to NAT, so the hashing
being symmetric doesn't help in this case, right?
>
> struct conn {
> /* Immutable data. */
> - struct conn_key key;
> - struct conn_key rev_key;
> + struct conn_key_node key_node[CT_DIRS];
> struct conn_key parent_key; /* Only used for orig_tuple support. */
> - struct cmap_node cm_node;
> uint16_t nat_action;
> char *alg;
> - struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
> atomic_flag reclaimed; /* False during the lifetime of the connection,
> * True as soon as a thread has started freeing
> * its memory. */
> @@ -150,7 +154,6 @@ struct conn {
>
> /* Immutable data. */
> bool alg_related; /* True if alg data connection. */
> - enum ct_conn_type conn_type;
>
> uint32_t tp_id; /* Timeout policy ID. */
> };
> diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> index 89cb2704a..2149fdc73 100644
> --- a/lib/conntrack-tp.c
> +++ b/lib/conntrack-tp.c
> @@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn
> *conn,
> }
> VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
> "val=%u sec.",
> - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
> + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
> + conn->tp_id, val);
>
> atomic_store_relaxed(&conn->expiration, now + val * 1000);
> }
> @@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn
> *conn,
> }
>
> VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
> - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
> + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
> + conn->tp_id, val);
>
> conn->expiration = now + val * 1000;
> }
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 5f1176d33..f75f9a8f1 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -113,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *,
> static void *clean_thread_main(void *f_);
>
> static bool
> -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
> - struct conn *nat_conn,
> +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
> const struct nat_action_info_t *nat_info);
>
> static uint8_t
> @@ -208,7 +207,7 @@ static alg_helper alg_helpers[] = {
> #define ALG_WC_SRC_PORT 0
>
> /* If the total number of connections goes above this value, no new
> connections
> - * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
> + * are accepted. */
> #define DEFAULT_N_CONN_LIMIT 3000000
>
> /* Does a member by member comparison of two conn_keys; this
> @@ -234,61 +233,6 @@ conn_key_cmp(const struct conn_key *key1, const struct
> conn_key *key2)
> return 1;
> }
>
> -static void
> -ct_print_conn_info(const struct conn *c, const char *log_msg,
> - enum vlog_level vll, bool force, bool rl_on)
> -{
> -#define CT_VLOG(RL_ON, LEVEL, ...) \
> - do { \
> - if (RL_ON) { \
> - static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
> - vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
> - } else { \
> - vlog(&this_module, LEVEL, __VA_ARGS__); \
> - } \
> - } while (0)
> -
> - if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
> - if (c->key.dl_type == htons(ETH_TYPE_IP)) {
> - CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src
> "
> - "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
> - "%"PRIu16"/%"PRIu16" rev src/dst ports "
> - "%"PRIu16"/%"PRIu16" zone/rev zone "
> - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
> - "%"PRIu8"/%"PRIu8, log_msg,
> - IP_ARGS(c->key.src.addr.ipv4),
> - IP_ARGS(c->key.dst.addr.ipv4),
> - IP_ARGS(c->rev_key.src.addr.ipv4),
> - IP_ARGS(c->rev_key.dst.addr.ipv4),
> - ntohs(c->key.src.port), ntohs(c->key.dst.port),
> - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
> - c->key.zone, c->rev_key.zone, c->key.nw_proto,
> - c->rev_key.nw_proto);
> - } else {
> - char ip6_s[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
> - char ip6_d[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
> - char ip6_rs[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
> - sizeof ip6_rs);
> - char ip6_rd[INET6_ADDRSTRLEN];
> - inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
> - sizeof ip6_rd);
> -
> - CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
> - " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
> - " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
> - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
> - "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
> - ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
> - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
> - c->key.zone, c->rev_key.zone, c->key.nw_proto,
> - c->rev_key.nw_proto);
> - }
> - }
> -}
> -
> /* Initializes the connection tracker 'ct'. The caller is responsible for
> * calling 'conntrack_destroy()', when the instance is not needed anymore */
> struct conntrack *
> @@ -477,28 +421,27 @@ conn_clean__(struct conntrack *ct, struct conn *conn)
> uint32_t hash;
>
> if (conn->alg) {
> - expectation_clean(ct, &conn->key);
> + expectation_clean(ct, &conn->key_node[CT_DIR_FWD].key);
> }
>
> - hash = conn_key_hash(&conn->key, ct->hash_basis);
> - cmap_remove(&ct->conns, &conn->cm_node, hash);
> + hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis);
> + cmap_remove(&ct->conns, &conn->key_node[CT_DIR_FWD].cm_node, hash);
>
> - if (conn->nat_conn) {
> - hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
> - cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
> + if (conn->nat_action) {
> + hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key,
> + ct->hash_basis);
> + cmap_remove(&ct->conns, &conn->key_node[CT_DIR_REV].cm_node, hash);
> }
>
> rculist_remove(&conn->node);
> }
>
> -/* Must be called with 'conn' of 'conn_type' CT_CONN_TYPE_DEFAULT. Also
> - * removes the associated nat 'conn' from the lookup datastructures. */
> +/* Also removes the associated nat 'conn' from the lookup
> + datastructures. */
> static void
> conn_clean(struct conntrack *ct, struct conn *conn)
> OVS_EXCLUDED(conn->lock, ct->ct_lock)
> {
> - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> -
> if (atomic_flag_test_and_set(&conn->reclaimed)) {
> return;
> }
> @@ -585,34 +528,34 @@ conn_key_lookup(struct conntrack *ct, const struct
> conn_key *key,
> uint32_t hash, long long now, struct conn **conn_out,
> bool *reply)
> {
> - struct conn *conn;
> + struct conn_key_node *keyn;
> + struct conn *conn = NULL;
> bool found = false;
>
> - CMAP_FOR_EACH_WITH_HASH (conn, cm_node, hash, &ct->conns) {
> + CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns) {
> + conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->dir]);
Hmm, that's a neat trick, but I'm not sure we're allowed to do that.
CONTAINER_OF involves offsetof, and offsetof according to documentation
supposed to expand into 'integer constant expression'. It is allowed
to have nested types and array indexes, but I was unable to find any
reference to using a variable as an index. And there is no way for an
expression that contains a variable to be expanded into integer constant
expression. So, I'm wondering if this line constitutes an undefined
behavior?
Me might have to put it into an if expression in order to have a
compile-time constant offsetof. i.e.
if (keyn->dir == CT_DIR_FWD) {
conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
} else {
conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_REV]);
}
Same for all other occurances.
Thoughts?
> if (conn_expired(conn, now)) {
> continue;
> }
> - if (!conn_key_cmp(&conn->key, key)) {
> - found = true;
> - if (reply) {
> - *reply = false;
> - }
> - break;
> - }
> - if (!conn_key_cmp(&conn->rev_key, key)) {
> - found = true;
> - if (reply) {
> - *reply = true;
> +
> + for (int i = CT_DIR_FWD; i < CT_DIRS; i++) {
> + if (!conn_key_cmp(&conn->key_node[i].key, key)) {
> + found = true;
> + if (reply) {
> + *reply = i;
This is an int to bool assignment, it should be = (i == CT_DIR_REV);
> + }
> + goto out_found;
> }
> - break;
> }
> }
>
> +out_found:
> if (found && conn_out) {
> *conn_out = conn;
> } else if (conn_out) {
> *conn_out = NULL;
> }
> +
> return found;
> }
>
> @@ -646,7 +589,7 @@ write_ct_md(struct dp_packet *pkt, uint16_t zone, const
> struct conn *conn,
> if (conn->alg_related) {
> key = &conn->parent_key;
> } else {
> - key = &conn->key;
> + key = &conn->key_node[CT_DIR_FWD].key;
> }
> } else if (alg_exp) {
> pkt->md.ct_mark = alg_exp->parent_mark;
> @@ -877,7 +820,8 @@ nat_inner_packet(struct dp_packet *pkt, struct conn_key
> *key,
> static void
> nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool
> related)
> {
> - struct conn_key *key = reply ? &conn->key : &conn->rev_key;
> + enum key_dir dir = reply ? CT_DIR_FWD : CT_DIR_REV;
> + struct conn_key *key = &conn->key_node[dir].key;
> uint16_t nat_action = reply ? nat_action_reverse(conn->nat_action)
> : conn->nat_action;
>
> @@ -911,7 +855,7 @@ conn_seq_skew_set(struct conntrack *ct, const struct conn
> *conn_in,
> {
> struct conn *conn;
>
> - conn_lookup(ct, &conn_in->key, now, &conn, NULL);
> + conn_lookup(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL);
> if (conn && seq_skew) {
> conn->seq_skew = seq_skew;
> conn->seq_skew_dir = seq_skew_dir;
> @@ -947,7 +891,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> OVS_REQUIRES(ct->ct_lock)
> {
> struct conn *nc = NULL;
> - struct conn *nat_conn = NULL;
>
> if (!valid_new(pkt, &ctx->key)) {
> pkt->md.ct_state = CS_INVALID;
> @@ -961,6 +904,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> }
>
> if (commit) {
> + struct conn_key_node *fwd_key_node, *rev_key_node;
> struct zone_limit *zl = zone_limit_lookup_or_default(ct,
> ctx->key.zone);
> if (zl && atomic_count_get(&zl->czl.count) >= zl->czl.limit) {
> @@ -975,9 +919,12 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> }
>
> nc = new_conn(ct, pkt, &ctx->key, now, tp_id);
> - memcpy(&nc->key, &ctx->key, sizeof nc->key);
> - memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key);
> - conn_key_reverse(&nc->rev_key);
> + fwd_key_node = &nc->key_node[CT_DIR_FWD];
> + rev_key_node = &nc->key_node[CT_DIR_REV];
> + memcpy(&fwd_key_node->key, &ctx->key, sizeof fwd_key_node->key);
> + memcpy(&rev_key_node->key, &fwd_key_node->key,
> + sizeof rev_key_node->key);
> + conn_key_reverse(&rev_key_node->key);
>
> if (ct_verify_helper(helper, ct_alg_ctl)) {
> nc->alg = nullable_xstrdup(helper);
> @@ -992,46 +939,33 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
>
> if (nat_action_info) {
> nc->nat_action = nat_action_info->nat_action;
> - nat_conn = xzalloc(sizeof *nat_conn);
>
> if (alg_exp) {
> if (alg_exp->nat_rpl_dst) {
> - nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
> + rev_key_node->key.dst.addr = alg_exp->alg_nat_repl_addr;
> nc->nat_action = NAT_ACTION_SRC;
> } else {
> - nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
> + rev_key_node->key.src.addr = alg_exp->alg_nat_repl_addr;
> nc->nat_action = NAT_ACTION_DST;
> }
> } else {
> - memcpy(nat_conn, nc, sizeof *nat_conn);
> - bool nat_res = nat_get_unique_tuple(ct, nc, nat_conn,
> - nat_action_info);
> -
> + bool nat_res = nat_get_unique_tuple(ct, nc, nat_action_info);
> if (!nat_res) {
> goto nat_res_exhaustion;
> }
> -
> - /* Update nc with nat adjustments made to nat_conn by
> - * nat_get_unique_tuple(). */
> - memcpy(nc, nat_conn, sizeof *nc);
> }
>
> nat_packet(pkt, nc, false, ctx->icmp_related);
> - memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
> - memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key);
> - nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
> - nat_conn->nat_action = 0;
> - nat_conn->alg = NULL;
> - nat_conn->nat_conn = NULL;
> - uint32_t nat_hash = conn_key_hash(&nat_conn->key,
> ct->hash_basis);
> - cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
> + uint32_t rev_hash = conn_key_hash(&rev_key_node->key,
> + ct->hash_basis);
> + cmap_insert(&ct->conns, &rev_key_node->cm_node, rev_hash);
> }
>
> - nc->nat_conn = nat_conn;
> ovs_mutex_init_adaptive(&nc->lock);
> - nc->conn_type = CT_CONN_TYPE_DEFAULT;
> atomic_flag_clear(&nc->reclaimed);
> - cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
> + fwd_key_node->dir = CT_DIR_FWD;
> + rev_key_node->dir = CT_DIR_REV;
> + cmap_insert(&ct->conns, &fwd_key_node->cm_node, ctx->hash);
> conn_expire_push_front(ct, nc);
> atomic_count_inc(&ct->n_conn);
> ctx->conn = nc; /* For completeness. */
> @@ -1052,7 +986,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet
> *pkt,
> * firewall rules or a separate firewall. Also using zone partitioning
> * can limit DoS impact. */
> nat_res_exhaustion:
> - free(nat_conn);
> delete_conn__(nc);
> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
> VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
> @@ -1065,7 +998,6 @@ conn_update_state(struct conntrack *ct, struct dp_packet
> *pkt,
> struct conn_lookup_ctx *ctx, struct conn *conn,
> long long now)
> {
> - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> bool create_new_conn = false;
>
> if (ctx->icmp_related) {
> @@ -1092,7 +1024,8 @@ conn_update_state(struct conntrack *ct, struct
> dp_packet *pkt,
> pkt->md.ct_state = CS_INVALID;
> break;
> case CT_UPDATE_NEW:
> - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
> + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
> + now, NULL, NULL)) {
> conn_force_expire(conn);
> }
> create_new_conn = true;
> @@ -1268,8 +1201,10 @@ initial_conn_lookup(struct conntrack *ct, struct
> conn_lookup_ctx *ctx,
>
> if (natted) {
> if (OVS_LIKELY(ctx->conn)) {
> + enum key_dir dir;
> ctx->reply = !ctx->reply;
> - ctx->key = ctx->reply ? ctx->conn->rev_key : ctx->conn->key;
> + dir = ctx->reply ? CT_DIR_REV : CT_DIR_FWD;
> + ctx->key = ctx->conn->key_node[dir].key;
> ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
> } else {
> /* A lookup failure does not necessarily imply that an
> @@ -1302,31 +1237,13 @@ process_one(struct conntrack *ct, struct dp_packet
> *pkt,
>
> /* Delete found entry if in wrong direction. 'force' implies commit. */
> if (OVS_UNLIKELY(force && ctx->reply && conn)) {
> - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
> + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
> + now, NULL, NULL)) {
> conn_force_expire(conn);
> }
> conn = NULL;
> }
>
> - if (OVS_LIKELY(conn)) {
> - if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
> -
> - ctx->reply = true;
> - struct conn *rev_conn = conn; /* Save for debugging. */
> - uint32_t hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
> - conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply);
> -
> - if (!conn) {
> - pkt->md.ct_state |= CS_INVALID;
> - write_ct_md(pkt, zone, NULL, NULL, NULL);
> - char *log_msg = xasprintf("Missing parent conn %p",
> rev_conn);
> - ct_print_conn_info(rev_conn, log_msg, VLL_INFO, true, true);
> - free(log_msg);
> - return;
> - }
> - }
> - }
> -
> enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
> helper);
>
> @@ -1419,8 +1336,9 @@ conntrack_execute(struct conntrack *ct, struct
> dp_packet_batch *pkt_batch,
> struct conn *conn = packet->md.conn;
> if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) {
> write_ct_md(packet, zone, NULL, NULL, NULL);
> - } else if (conn && conn->key.zone == zone && !force
> - && !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) {
> + } else if (conn &&
> + conn->key_node[CT_DIR_FWD].key.zone == zone && !force &&
> + !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) {
> process_one_fast(zone, setmark, setlabel, nat_action_info,
> conn, packet);
> } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx,
> @@ -2269,7 +2187,7 @@ nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t
> increment)
> }
>
> static uint32_t
> -nat_range_hash(const struct conn *conn, uint32_t basis,
> +nat_range_hash(const struct conn_key *key, uint32_t basis,
> const struct nat_action_info_t *nat_info)
> {
> uint32_t hash = basis;
> @@ -2279,11 +2197,11 @@ nat_range_hash(const struct conn *conn, uint32_t
> basis,
> hash = hash_add(hash,
> ((uint32_t) nat_info->max_port << 16)
> | nat_info->min_port);
> - hash = ct_endpoint_hash_add(hash, &conn->key.src);
> - hash = ct_endpoint_hash_add(hash, &conn->key.dst);
> - hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
> - hash = hash_add(hash, conn->key.nw_proto);
> - hash = hash_add(hash, conn->key.zone);
> + hash = ct_endpoint_hash_add(hash, &key->src);
> + hash = ct_endpoint_hash_add(hash, &key->dst);
> + hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
> + hash = hash_add(hash, key->nw_proto);
> + hash = hash_add(hash, key->zone);
>
> /* The purpose of the second parameter is to distinguish hashes of data
> of
> * different length; our data always has the same length so there is no
> @@ -2357,7 +2275,7 @@ get_addr_in_range(union ct_addr *min, union ct_addr
> *max,
> }
>
> static void
> -find_addr(const struct conn *conn, union ct_addr *min,
> +find_addr(const struct conn_key *key, union ct_addr *min,
> union ct_addr *max, union ct_addr *curr,
> uint32_t hash, bool ipv4,
> const struct nat_action_info_t *nat_info)
> @@ -2367,9 +2285,9 @@ find_addr(const struct conn *conn, union ct_addr *min,
> /* All-zero case. */
> if (!memcmp(min, &zero_ip, sizeof *min)) {
> if (nat_info->nat_action & NAT_ACTION_SRC) {
> - *curr = conn->key.src.addr;
> + *curr = key->src.addr;
> } else if (nat_info->nat_action & NAT_ACTION_DST) {
> - *curr = conn->key.dst.addr;
> + *curr = key->dst.addr;
> }
> } else {
> get_addr_in_range(min, max, curr, hash, ipv4);
> @@ -2388,7 +2306,7 @@ store_addr_to_key(union ct_addr *addr, struct conn_key
> *key,
> }
>
> static bool
> -nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn,
> +nat_get_unique_l4(struct conntrack *ct, struct conn_key *rev_key,
> ovs_be16 *port, uint16_t curr, uint16_t min,
> uint16_t max)
> {
> @@ -2411,7 +2329,7 @@ another_round:
> }
>
> *port = htons(curr);
> - if (!conn_lookup(ct, &nat_conn->rev_key,
> + if (!conn_lookup(ct, rev_key,
> time_msec(), NULL, NULL)) {
Can be a single line.
> return true;
> }
> @@ -2450,39 +2368,41 @@ another_round:
> *
> * If none can be found, return exhaustion to the caller. */
> static bool
> -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
> - struct conn *nat_conn,
> +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
> const struct nat_action_info_t *nat_info)
> {
> - uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info);
> + struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key,
> + *rev_key = &conn->key_node[CT_DIR_REV].key;
This style of initialization doesn't look good. Please, just
specify a type on a second line as well.
> union ct_addr min_addr = {0}, max_addr = {0}, addr = {0};
> - bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
> - conn->key.nw_proto == IPPROTO_UDP ||
> - conn->key.nw_proto == IPPROTO_SCTP;
> + bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
> + fwd_key->nw_proto == IPPROTO_UDP ||
> + fwd_key->nw_proto == IPPROTO_SCTP;
> uint16_t min_dport, max_dport, curr_dport;
> uint16_t min_sport, max_sport, curr_sport;
> + uint32_t hash;
>
> + hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
> min_addr = nat_info->min_addr;
> max_addr = nat_info->max_addr;
>
> - find_addr(conn, &min_addr, &max_addr, &addr, hash,
> - (conn->key.dl_type == htons(ETH_TYPE_IP)), nat_info);
> + find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
> + (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
>
> - set_sport_range(nat_info, &conn->key, hash, &curr_sport,
> - &min_sport, &max_sport);
> - set_dport_range(nat_info, &conn->key, hash, &curr_dport,
> - &min_dport, &max_dport);
> + set_sport_range(nat_info, fwd_key, hash, &curr_sport,
> + &min_sport, &max_sport);
> + set_dport_range(nat_info, fwd_key, hash, &curr_dport,
> + &min_dport, &max_dport);
The continuation lines above should start right after '(' on
a previous line. i.e. 1 space to the left.
>
> if (pat_proto) {
> - nat_conn->rev_key.src.port = htons(curr_dport);
> - nat_conn->rev_key.dst.port = htons(curr_sport);
> + rev_key->src.port = htons(curr_dport);
> + rev_key->dst.port = htons(curr_sport);
> }
>
> - store_addr_to_key(&addr, &nat_conn->rev_key,
> + store_addr_to_key(&addr, rev_key,
> nat_info->nat_action);
This one can be a single line.
>
> if (!pat_proto) {
> - if (!conn_lookup(ct, &nat_conn->rev_key,
> + if (!conn_lookup(ct, rev_key,
> time_msec(), NULL, NULL)) {
> return true;
> }
Not sure if we need to change that now, but this whole body of the
if statement can be a "return !conn_lookup();"
> @@ -2492,12 +2412,12 @@ nat_get_unique_tuple(struct conntrack *ct, const
> struct conn *conn,
>
> bool found = false;
> if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
> - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port,
> + found = nat_get_unique_l4(ct, rev_key, &rev_key->src.port,
> curr_dport, min_dport, max_dport);
> }
>
> if (!found) {
> - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port,
> + found = nat_get_unique_l4(ct, rev_key, &rev_key->dst.port,
> curr_sport, min_sport, max_sport);
> }
>
> @@ -2513,9 +2433,10 @@ conn_update(struct conntrack *ct, struct conn *conn,
> struct dp_packet *pkt,
> struct conn_lookup_ctx *ctx, long long now)
> {
> ovs_mutex_lock(&conn->lock);
> + uint8_t nw_proto = conn->key_node[CT_DIR_FWD].key.nw_proto;
> enum ct_update_res update_res =
> - l4_protos[conn->key.nw_proto]->conn_update(ct, conn, pkt, ctx->reply,
> - now);
> + l4_protos[nw_proto]->conn_update(ct, conn, pkt, ctx->reply,
> + now);
The 'now' can be on a previous line.
> ovs_mutex_unlock(&conn->lock);
> return update_res;
> }
> @@ -2543,10 +2464,7 @@ conn_expiration(const struct conn *conn)
> static bool
> conn_expired(struct conn *conn, long long now)
Can we make it have a const ponter as input?
> {
> - if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> - return now >= conn_expiration(conn);
> - }
> - return false;
> + return now >= conn_expiration(conn);
> }
>
> static bool
> @@ -2572,9 +2490,7 @@ delete_conn__(struct conn *conn)
> static void
> delete_conn(struct conn *conn)
> {
> - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> ovs_mutex_destroy(&conn->lock);
> - free(conn->nat_conn);
> delete_conn__(conn);
> }
>
> @@ -2667,15 +2583,18 @@ static void
> conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
> long long now)
> {
> + const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key,
> + *rev_key = &conn->key_node[CT_DIR_REV].key;
Same here. Put the type on the second line as well. Or define them
and initialize later.
> +
> memset(entry, 0, sizeof *entry);
> - conn_key_to_tuple(&conn->key, &entry->tuple_orig);
> - conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
> + conn_key_to_tuple(key, &entry->tuple_orig);
> + conn_key_to_tuple(rev_key, &entry->tuple_reply);
>
> if (conn->alg_related) {
> conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent);
> }
>
> - entry->zone = conn->key.zone;
> + entry->zone = key->zone;
>
> ovs_mutex_lock(&conn->lock);
> entry->mark = conn->mark;
> @@ -2683,7 +2602,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct
> ct_dpif_entry *entry,
>
> long long expiration = conn_expiration(conn) - now;
>
> - struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
> + struct ct_l4_proto *class = l4_protos[key->nw_proto];
> if (class->conn_get_protoinfo) {
> class->conn_get_protoinfo(conn, &entry->protoinfo);
> }
> @@ -2731,15 +2650,17 @@ conntrack_dump_next(struct conntrack_dump *dump,
> struct ct_dpif_entry *entry)
> if (!cm_node) {
> break;
> }
> + struct conn_key_node *keyn;
> struct conn *conn;
> - INIT_CONTAINER(conn, cm_node, cm_node);
>
> + INIT_CONTAINER(keyn, cm_node, cm_node);
> + conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->dir]);
integer constant expression.
> if (conn_expired(conn, now)) {
> continue;
> }
>
> - if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
> - (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
> + if ((!dump->filter_zone || keyn->key.zone == dump->zone) &&
> + (keyn->dir == CT_DIR_FWD)) {
Can we check for CT_DIR_FWD earlier in the loop? It's the easiest
thing to check and we're not using connections found by their REV key.
This way we may avoid potential offsetof UB and will save some time
on pointless expiration check.
> conn_to_ct_dpif_entry(conn, entry, now);
> return 0;
> }
> @@ -2823,14 +2744,15 @@ conntrack_exp_dump_done(struct conntrack_dump *dump
> OVS_UNUSED)
> int
> conntrack_flush(struct conntrack *ct, const uint16_t *zone)
> {
> + struct conn_key_node *keyn;
> struct conn *conn;
>
> - CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
> - if (conn->conn_type != CT_CONN_TYPE_DEFAULT) {
> + CMAP_FOR_EACH (keyn, cm_node, &ct->conns) {
> + if (keyn->dir != CT_DIR_FWD) {
> continue;
> }
> -
> - if (!zone || *zone == conn->key.zone) {
> + conn = CONTAINER_OF(keyn, struct conn, key_node[keyn->dir]);
integer constant expression.
And we actually know that keyn->dir equals CT_DIR_FWD here.
> + if (!zone || *zone == keyn->key.zone) {
> conn_clean(ct, conn);
> }
> }
> @@ -2842,18 +2764,18 @@ int
> conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple
> *tuple,
> uint16_t zone)
> {
> - int error = 0;
> struct conn_key key;
> struct conn *conn;
> + int error = 0;
>
> memset(&key, 0, sizeof(key));
> tuple_to_conn_key(tuple, zone, &key);
> conn_lookup(ct, &key, time_msec(), &conn, NULL);
>
> - if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> + if (conn) {
> conn_clean(ct, conn);
> } else {
> - VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
> + VLOG_WARN("Tuple not found");
> error = ENOENT;
> }
>
> @@ -2996,50 +2918,52 @@ expectation_create(struct conntrack *ct, ovs_be16
> dst_port,
> const struct conn *parent_conn, bool reply, bool
> src_ip_wc,
> bool skip_nat)
> {
> + const struct conn_key *pconn_key =
> &parent_conn->key_node[CT_DIR_FWD].key,
> + *pconn_rev_key = &parent_conn->key_node[CT_DIR_REV].key;
Ditto.
> union ct_addr src_addr;
> union ct_addr dst_addr;
> union ct_addr alg_nat_repl_addr;
> struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
>
> if (reply) {
> - src_addr = parent_conn->key.src.addr;
> - dst_addr = parent_conn->key.dst.addr;
> + src_addr = pconn_key->src.addr;
> + dst_addr = pconn_key->dst.addr;
> alg_exp_node->nat_rpl_dst = true;
> if (skip_nat) {
> alg_nat_repl_addr = dst_addr;
> } else if (parent_conn->nat_action & NAT_ACTION_DST) {
> - alg_nat_repl_addr = parent_conn->rev_key.src.addr;
> + alg_nat_repl_addr = pconn_rev_key->src.addr;
> alg_exp_node->nat_rpl_dst = false;
> } else {
> - alg_nat_repl_addr = parent_conn->rev_key.dst.addr;
> + alg_nat_repl_addr = pconn_rev_key->dst.addr;
> }
> } else {
> - src_addr = parent_conn->rev_key.src.addr;
> - dst_addr = parent_conn->rev_key.dst.addr;
> + src_addr = pconn_rev_key->src.addr;
> + dst_addr = pconn_rev_key->dst.addr;
> alg_exp_node->nat_rpl_dst = false;
> if (skip_nat) {
> alg_nat_repl_addr = src_addr;
> } else if (parent_conn->nat_action & NAT_ACTION_DST) {
> - alg_nat_repl_addr = parent_conn->key.dst.addr;
> + alg_nat_repl_addr = pconn_key->dst.addr;
> alg_exp_node->nat_rpl_dst = true;
> } else {
> - alg_nat_repl_addr = parent_conn->key.src.addr;
> + alg_nat_repl_addr = pconn_key->src.addr;
> }
> }
> if (src_ip_wc) {
> memset(&src_addr, 0, sizeof src_addr);
> }
>
> - alg_exp_node->key.dl_type = parent_conn->key.dl_type;
> - alg_exp_node->key.nw_proto = parent_conn->key.nw_proto;
> - alg_exp_node->key.zone = parent_conn->key.zone;
> + alg_exp_node->key.dl_type = pconn_key->dl_type;
> + alg_exp_node->key.nw_proto = pconn_key->nw_proto;
> + alg_exp_node->key.zone = pconn_key->zone;
> alg_exp_node->key.src.addr = src_addr;
> alg_exp_node->key.dst.addr = dst_addr;
> alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
> alg_exp_node->key.dst.port = dst_port;
> alg_exp_node->parent_mark = parent_conn->mark;
> alg_exp_node->parent_label = parent_conn->label;
> - memcpy(&alg_exp_node->parent_key, &parent_conn->key,
> + memcpy(&alg_exp_node->parent_key, pconn_key,
> sizeof alg_exp_node->parent_key);
> /* Take the write lock here because it is almost 100%
> * likely that the lookup will fail and
> @@ -3291,12 +3215,16 @@ process_ftp_ctl_v4(struct conntrack *ct,
>
> switch (mode) {
> case CT_FTP_MODE_ACTIVE:
> - *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4;
> - conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4;
> + *v4_addr_rep =
> + conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr.ipv4;
> + conn_ipv4_addr =
> + conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv4;
> break;
> case CT_FTP_MODE_PASSIVE:
> - *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4;
> - conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4;
> + *v4_addr_rep =
> + conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr.ipv4;
> + conn_ipv4_addr =
> + conn_for_expectation->key_node[CT_DIR_REV].key.src.addr.ipv4;
> break;
> case CT_TFTP_MODE:
> default:
> @@ -3328,7 +3256,7 @@ skip_ipv6_digits(char *str)
> static enum ftp_ctl_pkt
> process_ftp_ctl_v6(struct conntrack *ct,
> struct dp_packet *pkt,
> - const struct conn *conn_for_expectation,
> + const struct conn *conn_for_exp,
> union ct_addr *v6_addr_rep, char **ftp_data_start,
> size_t *addr_offset_from_ftp_data_start,
> size_t *addr_size, enum ct_alg_mode *mode)
> @@ -3396,24 +3324,25 @@ process_ftp_ctl_v6(struct conntrack *ct,
>
> switch (*mode) {
> case CT_FTP_MODE_ACTIVE:
> - *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
> + *v6_addr_rep = conn_for_exp->key_node[CT_DIR_REV].key.dst.addr;
> /* Although most servers will block this exploit, there may be some
> * less well managed. */
> if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
> - memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6,
> + memcmp(&ip6_addr,
> + &conn_for_exp->key_node[CT_DIR_FWD].key.src.addr.ipv6,
> sizeof ip6_addr)) {
> return CT_FTP_CTL_INVALID;
> }
> break;
> case CT_FTP_MODE_PASSIVE:
> - *v6_addr_rep = conn_for_expectation->key.dst.addr;
> + *v6_addr_rep = conn_for_exp->key_node[CT_DIR_FWD].key.dst.addr;
> break;
> case CT_TFTP_MODE:
> default:
> OVS_NOT_REACHED();
> }
>
> - expectation_create(ct, port, conn_for_expectation,
> + expectation_create(ct, port, conn_for_exp,
> !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
> return CT_FTP_CTL_INTEREST;
> }
> @@ -3571,7 +3500,8 @@ handle_tftp_ctl(struct conntrack *ct,
> long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl
> OVS_UNUSED,
> bool nat OVS_UNUSED)
> {
> - expectation_create(ct, conn_for_expectation->key.src.port,
> + expectation_create(ct,
> +
> conn_for_expectation->key_node[CT_DIR_FWD].key.src.port,
> conn_for_expectation,
> !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
> }
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev