When OvS-DPDK is set up as a firewall, the overall performance drops
significantly. This performance drop can be attributed to two reasons.

(1) Recirculation.
(2) Bottleneck inside CT module.

This RFC patch is an attempt to mitigate the bottleneck with CT module[2].
When a connection is established some of its relevant info is saved in
EMC and this info is used to track the incoming packets of the same
connection so that the CT module can be skipped for these packets.

The current implementation is done for UDP connections due to its stateless
nature. A UDP packet that is hitting an established connection will be sent
through the CT module only if timeout elapses. Otherwise it will skip the CT
module there by saving precious CPU cycles. Significant performance
improvement is observed with UDP connections.

Note that implementation is conditioned for UDP as skipping the CT
module breaks the TCP sequence checking logic inside 'tcp_conn_update'
for TCP connections.

We would like to receive feedback from the CT experts on the overall
approach before making more code changes. Also it would be nice
to know any corner-cases that would be affected with the proposed RFC.

CC: Darrell Ball <[email protected]>
CC: Joe Stringer <[email protected]>
Signed-off-by: Antonio Fischetti <[email protected]>
Co-authored-by: Bhanuprakash Bodireddy <[email protected]>
Signed-off-by: Bhanuprakash Bodireddy <[email protected]>

---
 - No testing is done with S-NAT, D-NAT cases.
 - The timeout value in CT_CONN_REFRESH_TIMEOUT_MS is an arbitrary value.
 - A further step could be to store some other infos, eg the hash
   values and take advantage of that when processing pkts of the
   same connection.
 - Below the performance jump we saw in our Firewall test-bench.

    SETUP
    =====
table=0, priority=1 actions=drop
table=0, priority=10,arp actions=NORMAL
table=0, priority=100,ct_state=-trk,ip actions=ct(table=1)
table=1, ct_state=+new+trk,ip,in_port=dpdk0 actions=ct(commit),output:dpdk1
table=1, ct_state=+new+trk,ip,in_port=dpdk1 actions=drop
table=1, ct_state=+est+trk,ip,in_port=dpdk0 actions=output:dpdk1
table=1, ct_state=+est+trk,ip,in_port=dpdk1 actions=output:dpdk0

IXIA: UDP Flows, 64B pkts, bidirectional test.
PDM threads:  2

Code built with: CFLAGS="-O2 -march=native -g"

    MEASUREMENTS
    ============
I measured packet Rx rate in Mpps (regardless of packet loss)
for each Rx side.

Orig means Commit ID:
    b9fedfa61f000f49500973d2a51e99a80d9cf9b8

  Flows  | Orig [Mpps] | +patch [Mpps]
 --------+-------------+----------------
  1      | 1.77, 1.72  |  5.97, 6.59
  10     | 2.35, 2.35  |  6.04, 6.28
  100    | 2.51, 2.54  |  5.29, 5.31
  1000   | 2.11, 2.14  |  3.81, 3.83
  10000  | 1.67, 1.70  |  2.01, 1.99
  100000 | 1.49, 1.52  |  1.51, 1.54
---
 lib/conntrack.c   | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/dpif-netdev.c | 13 +++++++--
 lib/packets.h     |  1 +
 3 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 419cb1d..b26fb68 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -53,6 +53,18 @@ struct conn_lookup_ctx {
     bool icmp_related;
 };
 
+struct emc_est_conn_info {
+    long long conn_refresh;     /* Dead-line to refresh connection. */
+    uint8_t  ct_state;          /* Connection state. */
+    uint32_t ct_mark;           /* Connection mark. */
+    ovs_u128 ct_label;          /* Connection label. */
+    ovs_be32 ipv4_src;
+    ovs_be32 ipv4_dst;
+    ovs_be16 src_port;
+    ovs_be16 dst_port;
+    uint8_t  ipv4_proto;
+};
+
 enum ftp_ctl_pkt {
     /* Control packets with address and/or port specifiers. */
     CT_FTP_CTL_INTEREST,
@@ -1123,6 +1135,46 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
     }
 }
 
+static inline void
+update_pkt_fields(struct dp_packet *packet,
+                  struct emc_est_conn_info *conn_shared_info,
+                  bool update_pkt_md)
+{
+    if (update_pkt_md) {
+        packet->md.ct_state = conn_shared_info->ct_state;
+        packet->md.ct_mark = conn_shared_info->ct_mark;
+        packet->md.ct_label = conn_shared_info->ct_label;
+        packet->md.ct_orig_tuple.ipv4.ipv4_src =
+                        conn_shared_info->ipv4_src;
+        packet->md.ct_orig_tuple.ipv4.ipv4_dst =
+                        conn_shared_info->ipv4_dst;
+        packet->md.ct_orig_tuple.ipv4.src_port =
+                        conn_shared_info->src_port;
+        packet->md.ct_orig_tuple.ipv4.dst_port =
+                        conn_shared_info->dst_port;
+        packet->md.ct_orig_tuple.ipv4.ipv4_proto =
+                        conn_shared_info->ipv4_proto;
+    } else {
+        conn_shared_info->ct_state = packet->md.ct_state;
+        conn_shared_info->ct_mark = packet->md.ct_mark;
+        conn_shared_info->ct_label = packet->md.ct_label;
+        conn_shared_info->ipv4_src =
+                        packet->md.ct_orig_tuple.ipv4.ipv4_src;
+        conn_shared_info->ipv4_dst =
+                        packet->md.ct_orig_tuple.ipv4.ipv4_dst;
+        conn_shared_info->src_port =
+                        packet->md.ct_orig_tuple.ipv4.src_port;
+        conn_shared_info->dst_port =
+                        packet->md.ct_orig_tuple.ipv4.dst_port;
+        conn_shared_info->ipv4_proto =
+                        packet->md.ct_orig_tuple.ipv4.ipv4_proto;
+
+        *(struct emc_est_conn_info **)(packet->md.est_conn_info) =
+                        conn_shared_info;
+    }
+}
+
+#define CT_CONN_REFRESH_TIMEOUT_MS 10000LL
 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
  * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
  * the l3 and and l4 offset properly set.
@@ -1144,8 +1196,26 @@ conntrack_execute(struct conntrack *ct, struct 
dp_packet_batch *pkt_batch,
     struct dp_packet **pkts = pkt_batch->packets;
     size_t cnt = pkt_batch->count;
     struct conn_lookup_ctx ctx;
+    struct emc_est_conn_info *conn_shared_info = NULL;
 
     for (size_t i = 0; i < cnt; i++) {
+
+        if (!force && pkts[i]->md.est_conn_info &&
+            (conn_shared_info =
+                *(struct emc_est_conn_info **)(pkts[i]->md.est_conn_info))) {
+            /* UDP */
+            if (conn_shared_info->ipv4_proto == IPPROTO_UDP) {
+                if (now > conn_shared_info->conn_refresh) {
+                    conn_shared_info->conn_refresh =
+                        now + CT_CONN_REFRESH_TIMEOUT_MS;
+                } else {
+                    pkts[i]->md.ct_zone = zone;
+                    update_pkt_fields(pkts[i], conn_shared_info, true);
+                    continue;
+                }
+            }
+        }
+
         if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
             pkts[i]->md.ct_state = CS_INVALID;
             write_ct_md(pkts[i], zone, NULL, NULL, NULL);
@@ -1153,6 +1223,20 @@ conntrack_execute(struct conntrack *ct, struct 
dp_packet_batch *pkt_batch,
         }
         process_one(ct, pkts[i], &ctx, zone, force, commit,
                     now, setmark, setlabel, nat_action_info, helper);
+
+        if (pkts[i]->md.est_conn_info) {
+            /* There's an EMC entry that was hit by this packet. */
+            if (pkts[i]->md.ct_state & CS_ESTABLISHED) {
+                if (!conn_shared_info) {
+                    /* XXX: Free up this mem location once conn. terminates. */
+                    conn_shared_info =
+                            xzalloc(sizeof(struct emc_est_conn_info));
+                    update_pkt_fields(pkts[i], conn_shared_info, false);
+                    conn_shared_info->conn_refresh =
+                            now + CT_CONN_REFRESH_TIMEOUT_MS;
+                }
+            }
+        }
     }
 
     return 0;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0ceef9d..4ce101f 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -160,6 +160,9 @@ struct netdev_flow_key {
 
 struct emc_entry {
     struct dp_netdev_flow *flow;
+    void *est_conn_info;        /* Infos for established connections.  It is
+                                   set when the corresponding L4 connection is
+                                   established. */
     struct netdev_flow_key key;   /* key.hash used for emc hash value. */
 };
 
@@ -2123,6 +2126,7 @@ emc_change_entry(struct emc_entry *ce, struct 
dp_netdev_flow *flow,
     if (key) {
         netdev_flow_key_clone(&ce->key, key);
     }
+    ce->est_conn_info = 0;
 }
 
 static inline void
@@ -2172,7 +2176,8 @@ emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
 }
 
 static inline struct dp_netdev_flow *
-emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
+emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key,
+           void **conn_info)
 {
     struct emc_entry *current_entry;
 
@@ -2181,6 +2186,8 @@ emc_lookup(struct emc_cache *cache, const struct 
netdev_flow_key *key)
             && emc_entry_alive(current_entry)
             && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
 
+            *conn_info = &current_entry->est_conn_info;
+
             /* We found the entry with the 'key->mf' miniflow */
             return current_entry->flow;
         }
@@ -4902,7 +4909,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
         key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
 
         /* If EMC is disabled skip emc_lookup */
-        flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
+        flow = (cur_min == 0) ? NULL:
+                      emc_lookup(flow_cache, key, &packet->md.est_conn_info);
+
         if (OVS_LIKELY(flow)) {
             dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                     n_batches);
diff --git a/lib/packets.h b/lib/packets.h
index 705d0b2..f25972a 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -101,6 +101,7 @@ PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
                                    action. */
     uint32_t skb_priority;      /* Packet priority for QoS. */
     uint32_t pkt_mark;          /* Packet mark. */
+    void *est_conn_info;        /* CT info for established L4 connection. */
     uint8_t  ct_state;          /* Connection state. */
     bool ct_orig_tuple_ipv6;
     uint16_t ct_zone;           /* Connection zone. */
-- 
2.4.11

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to