When the EMC hit rate goes down, start shedding load from the EMC.
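
Packets whose RSS hash falls at or below a per-PMD shed threshold skip
both the EMC lookup and the probabilistic EMC insert and are handled by
the megaflow classifier directly.  The threshold is re-evaluated roughly
every 3M packets per PMD: it is raised by one quantum (shedding more
load) while the net EMC hit rate is below 50% and lowered while it is
above 70%, up to a maximum of 15/16ths of the hash space.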
---
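Notes (not intended for the commit message):

Purely as an illustration of the tuning range, here is a stand-alone
sketch, with the constants mirrored from the defines added below, of how
each SHED_ADJ_QUANTUM step maps onto the fraction of the 32-bit RSS hash
space that is shed from the EMC:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Mirrors the patch: SHED_ADJ_QUANTUM is 0x10000000 (1/16th of
         * the 32-bit hash range) and SHED_THRESH_MAX works out to
         * 0xF0000000, i.e. at most 15/16ths of hashes are shed. */
        const uint32_t quantum = 0x10000000u;
        const uint32_t thresh_max = 0xF0000000u;
        uint32_t threshold;

        /* A packet skips the EMC when its RSS hash is <= the shed
         * threshold, so each quantum sheds a further ~6.25% of flows. */
        for (threshold = 0; ; threshold += quantum) {
            printf("threshold 0x%08"PRIX32" -> ~%.2f%% of flows shed\n",
                   threshold, 100.0 * threshold / UINT32_MAX);
            if (threshold >= thresh_max) {
                break;
            }
        }
        return 0;
    }

The new per-PMD counters added to pmd_info_show_stats() (shed thresh,
emc skips and the nett/gross EMC hit rates) should show up in the usual
per-PMD stats output, e.g. from ovs-appctl dpif-netdev/pmd-stats-show.
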
lib/dpif-netdev.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 103 insertions(+), 4 deletions(-)
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e3a5590..f77e79a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -158,6 +158,13 @@ struct netdev_flow_key {
#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
DEFAULT_EM_FLOW_INSERT_INV_PROB)
+struct emc_shed_state {
+ unsigned long long last_hit_cnt;
+ unsigned long long last_miss_cnt;
+ unsigned long long last_skip_cnt;
+ uint32_t shed_threshold;
+};
+
struct emc_entry {
struct dp_netdev_flow *flow;
struct netdev_flow_key key; /* key.hash used for emc hash value. */
@@ -166,6 +173,7 @@ struct emc_entry {
struct emc_cache {
struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
int sweep_idx; /* For emc_cache_slow_sweep(). */
+ struct emc_shed_state emc_shed_state;
};
/* Iterate in the exact match cache through every entry that might contain a
@@ -337,6 +345,7 @@ enum dp_stat_type {
DP_STAT_LOST, /* Packets not passed up to the client. */
DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
hits */
+ DP_STAT_EXACT_SKIPPED, /* Packets where the EMC lookup was skipped. */
DP_N_STATS
};
@@ -733,6 +742,10 @@ emc_cache_init(struct emc_cache *flow_cache)
int i;
flow_cache->sweep_idx = 0;
+ flow_cache->emc_shed_state.last_hit_cnt = 0;
+ flow_cache->emc_shed_state.last_miss_cnt = 0;
+ flow_cache->emc_shed_state.last_skip_cnt = 0;
+ flow_cache->emc_shed_state.shed_threshold = 0;
for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
flow_cache->entries[i].flow = NULL;
flow_cache->entries[i].key.hash = 0;
@@ -749,6 +762,10 @@ emc_cache_uninit(struct emc_cache *flow_cache)
for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
emc_clear_entry(&flow_cache->entries[i]);
}
+ flow_cache->emc_shed_state.last_hit_cnt = 0;
+ flow_cache->emc_shed_state.last_miss_cnt = 0;
+ flow_cache->emc_shed_state.last_skip_cnt = 0;
+ flow_cache->emc_shed_state.shed_threshold = 0;
}
/* Check and clear dead flow references slowly (one entry at each
@@ -839,11 +856,28 @@ pmd_info_show_stats(struct ds *reply,
}
ds_put_cstr(reply, ":\n");
+ /* XXX some items added here are for debug */
ds_put_format(reply,
"\temc hits:%llu\n\tmegaflow hits:%llu\n"
+ "\tshed thresh:0x%08X\n"
+ "\temc skips:%llu\n"
+ "\temc hit rate (nett) :%llu%%\n"
+ "\temc hit rate (gross):%llu%%\n"
"\tavg. subtable lookups per hit:%.2f\n"
"\tmiss:%llu\n\tlost:%llu\n",
stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
+ pmd->flow_cache.emc_shed_state.shed_threshold,
+ stats[DP_STAT_EXACT_SKIPPED],
+ (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT] -
+ stats[DP_STAT_EXACT_SKIPPED])
+ ? ((stats[DP_STAT_EXACT_HIT] * 100) /
+ (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT] -
+ stats[DP_STAT_EXACT_SKIPPED]))
+ : 0,
+ (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT])
+ ? ((stats[DP_STAT_EXACT_HIT] * 100) /
+ (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]))
+ : 0,
stats[DP_STAT_MASKED_HIT] > 0
? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
: 0,
@@ -1470,6 +1504,8 @@ dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
stats->n_hit += n;
atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
stats->n_hit += n;
+ atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_SKIPPED], &n);
+ stats->n_hit += n;
atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
stats->n_missed += n;
atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
@@ -4849,6 +4885,54 @@ dp_netdev_queue_batches(struct dp_packet *pkt,
packet_batch_per_flow_update(batch, pkt, mf);
}
+#define SHED_ADJ_INTERVAL_PKTS (3e6)
+#define SHED_ADJ_QUANTUM (0x10000000)
+#define SHED_THRESH_MAX (SHED_ADJ_QUANTUM + \
+ (SHED_ADJ_QUANTUM << 1) + \
+ (SHED_ADJ_QUANTUM << 2) + \
+ (SHED_ADJ_QUANTUM << 3))
+/* XXX use cost of EMC lookup & miss in cycles to replace hard bounds */
+#define SHED_HIT_RATE_LOWER_PC (50)
+#define SHED_HIT_RATE_UPPER_PC (70)
+
+
+static inline void
+adjust_emc_shedding (struct dp_netdev_pmd_thread *pmd)
+{
+ struct emc_cache *emc = &pmd->flow_cache;
+ unsigned long long emc_hit_cnt = pmd->stats.n[DP_STAT_EXACT_HIT] -
+ emc->emc_shed_state.last_hit_cnt;
+ unsigned long long emc_miss_cnt = pmd->stats.n[DP_STAT_MASKED_HIT] -
+ emc->emc_shed_state.last_miss_cnt;
+
+ if (emc_hit_cnt + emc_miss_cnt > SHED_ADJ_INTERVAL_PKTS) {
+ /* XXX protect against counter wrap around */
+ unsigned long long emc_skip_cnt = pmd->stats.n[DP_STAT_EXACT_SKIPPED] -
+ emc->emc_shed_state.last_skip_cnt;
+ unsigned long long emc_offered_cnt =
+ emc_hit_cnt + emc_miss_cnt - emc_skip_cnt;
+
+ unsigned int hit_rate_pc = (emc_hit_cnt * 100) / emc_offered_cnt;
+
+ emc->emc_shed_state.last_hit_cnt = pmd->stats.n[DP_STAT_EXACT_HIT];
+ emc->emc_shed_state.last_miss_cnt = pmd->stats.n[DP_STAT_MASKED_HIT];
+ emc->emc_shed_state.last_skip_cnt =
+ pmd->stats.n[DP_STAT_EXACT_SKIPPED];
+
+ /* As hit rate goes down shed thresh goes up (more is shed from EMC) */
+ /* XXX consider increment more if further out of bounds */
+ if (hit_rate_pc > SHED_HIT_RATE_UPPER_PC && \
+ emc->emc_shed_state.shed_threshold >= SHED_ADJ_QUANTUM) {
+ emc->emc_shed_state.shed_threshold -= SHED_ADJ_QUANTUM;
+ } else if (hit_rate_pc < SHED_HIT_RATE_LOWER_PC && \
+ emc->emc_shed_state.shed_threshold < SHED_THRESH_MAX) {
+ emc->emc_shed_state.shed_threshold += SHED_ADJ_QUANTUM;
+ }
+ }
+}
+
+
+
/* Try to process all ('cnt') the 'packets' using only the exact match cache
* 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
* miniflow is copied into 'keys' and the packet pointer is moved at the
@@ -4869,7 +4953,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
{
struct emc_cache *flow_cache = &pmd->flow_cache;
struct netdev_flow_key *key = &keys[0];
- size_t n_missed = 0, n_dropped = 0;
+ size_t n_missed = 0, n_dropped = 0, n_skipped = 0;
struct dp_packet *packet;
const size_t size = dp_packet_batch_size(packets_);
uint32_t cur_min;
@@ -4900,8 +4984,17 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
key->len = 0; /* Not computed yet. */
key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
+ adjust_emc_shedding(pmd);
+
/* If EMC is disabled skip emc_lookup */
- flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
+ if ((key->hash > flow_cache->emc_shed_state.shed_threshold) &&
+ cur_min) {
+ flow = emc_lookup(flow_cache, key);
+ } else {
+ flow = NULL;
+ n_skipped++;
+ }
+
if (OVS_LIKELY(flow)) {
dp_netdev_queue_batches(packet, flow, &key->mf, batches,
n_batches);
@@ -4916,6 +5009,8 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
}
}
+ dp_netdev_count_packet(pmd, DP_STAT_EXACT_SKIPPED,
+ n_skipped);
dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT,
size - n_dropped - n_missed);
@@ -4986,7 +5081,9 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
add_actions->size);
}
ovs_mutex_unlock(&pmd->flow_mutex);
- emc_probabilistic_insert(pmd, key, netdev_flow);
+ if (key->hash > pmd->flow_cache.emc_shed_state.shed_threshold) {
+ emc_probabilistic_insert(pmd, key, netdev_flow);
+ }
}
}
@@ -5079,7 +5176,9 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
flow = dp_netdev_flow_cast(rules[i]);
- emc_probabilistic_insert(pmd, &keys[i], flow);
+ if (keys[i].hash > pmd->flow_cache.emc_shed_state.shed_threshold) {
+ emc_probabilistic_insert(pmd, &keys[i], flow);
+ }
dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
}
--
2.7.4