On 5/13/20 3:31 PM, [email protected] wrote:
> From: Tonghao Zhang <[email protected]>
>
> Currently, ovs-vswitchd uses an array of dp_meter structs
> to store meter data, and at most 65536 (defined by
> MAX_METERS) meters can be used. But in some cases, for
> example in an edge gateway, we need at least 200,000 meters
> for IP address bandwidth limitation. Each IP address uses
> two meters, one for its rx path and one for its tx path[1].
> In addition, ovs-vswitchd should support meter offload (the
> rte_mtr_xxx API introduced by DPDK), and some hardware, such
> as the Mellanox ConnectX-6, supports more than 65536 meters.
>
> This patch still uses an array to manage the meters, but the array can be expanded.
>
> [1].
> $ in_port=p0,ip,ip_dst=1.1.1.x action=meter:n,output:p1
> $ in_port=p1,ip,ip_src=1.1.1.x action=meter:m,output:p0
>
> Cc: Ilya Maximets <[email protected]>
> Cc: William Tu <[email protected]>
> Cc: Jarno Rajahalme <[email protected]>
> Cc: Ben Pfaff <[email protected]>
> Cc: Andy Zhou <[email protected]>
> Cc: Pravin Shelar <[email protected]>
> Signed-off-by: Tonghao Zhang <[email protected]>
> ---
> v2:
> * add comments for dp_meter_instance
> * change the log
> * remove extra newline
> * I didn't move dp_netdev_meter_init/destroy up, because
> they depend on other meter functions, and keeping all the meter
> functions together may make the code cleaner.
> ---
Hi. Thanks for working on this!
This is not a full review, just a few things that I spotted on a quick glance.
I didn't review any thread safety/rcu aspects yet.
Best regards, Ilya Maximets.
> lib/dpif-netdev.c | 319 ++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 250 insertions(+), 69 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index ef14e83b5f06..b5deaab31eb0 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -98,9 +98,12 @@ DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>
> /* Configuration parameters. */
> enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
> -enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
> -enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
> -enum { N_METER_LOCKS = 64 }; /* Maximum number of meters. */
> +
> +/* Maximum number of meters in the table. */
> +#define METER_ENTRY_MAX (200000ULL)
> +/* Maximum number of bands / meter. */
> +#define METER_BAND_MAX (8)
> +#define DP_METER_ARRAY_SIZE_MIN (1ULL << 10)
Why do we need to change the enums to defines and also rename them?
>
> COVERAGE_DEFINE(datapath_drop_meter);
> COVERAGE_DEFINE(datapath_drop_upcall_error);
> @@ -283,12 +286,26 @@ struct dp_meter {
> uint16_t flags;
> uint16_t n_bands;
> uint32_t max_delta_t;
> + uint32_t id;
> + struct ovs_mutex lock;
> uint64_t used;
> uint64_t packet_count;
> uint64_t byte_count;
> struct dp_meter_band bands[];
> };
>
> +struct dp_meter_instance {
> + uint32_t n_meters;
This should be called 'n_allocated' or something like this.
'n_meters' makes me think that it's the number of actually used meters.
> + /* Followed by struct dp_meter[n]; where n is the n_meters. */
> + OVSRCU_TYPE(struct dp_meter *) dp_meters[];
> +};
> +
> +struct dp_meter_table {
> + OVSRCU_TYPE(struct dp_meter_instance *) ti;
What does 'ti' mean? I looked through the code and it always stands for the
meter instance, but how does 'meter instance' relate to 'ti'? That is
confusing.
> + uint32_t count;
Why is count part of 'dp_meter_table'? I think it should be part of
'dp_meter_instance'
and named something like 'n_used', or actually 'n_meters'.
> + struct ovs_mutex lock;
> +};
Why do we need this structure at all? Can it be just 3 fields inside struct
dp_netdev?
Why is it a table? It's not a table. 'instance' is a table. Confusing.
> +
> struct pmd_auto_lb {
> bool auto_lb_requested; /* Auto load balancing requested by user. */
> bool is_enabled; /* Current status of Auto load balancing. */
> @@ -329,8 +346,7 @@ struct dp_netdev {
> atomic_uint32_t tx_flush_interval;
>
> /* Meters. */
> - struct ovs_mutex meter_locks[N_METER_LOCKS];
> - struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> + struct dp_meter_table meter_tbl;
>
> /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
> OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
> @@ -378,19 +394,6 @@ struct dp_netdev {
> struct pmd_auto_lb pmd_alb;
> };
>
> -static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> - OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> - ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
> - OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> - ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
> -}
> -
> -
> static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev
> *dp,
> odp_port_t)
> OVS_REQUIRES(dp->port_mutex);
> @@ -1523,6 +1526,9 @@ choose_port(struct dp_netdev *dp, const char *name)
> return ODPP_NONE;
> }
>
> +static void dp_netdev_meter_init(struct dp_meter_table *tbl);
> +static void dp_netdev_meter_destroy(struct dp_meter_table *tbl);
These functions should be named dp_netdev_meter_table_{init,destroy}.
> +
> static int
> create_dp_netdev(const char *name, const struct dpif_class *class,
> struct dp_netdev **dpp)
> @@ -1556,9 +1562,7 @@ create_dp_netdev(const char *name, const struct
> dpif_class *class,
> dp->reconfigure_seq = seq_create();
> dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
>
> - for (int i = 0; i < N_METER_LOCKS; ++i) {
> - ovs_mutex_init_adaptive(&dp->meter_locks[i]);
> - }
> + dp_netdev_meter_init(&dp->meter_tbl);
>
> /* Disable upcalls by default. */
> dp_netdev_disable_upcall(dp);
> @@ -1647,16 +1651,6 @@ dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
> fat_rwlock_destroy(&dp->upcall_rwlock);
> }
>
> -static void
> -dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
> - OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
> -{
> - if (dp->meters[meter_id]) {
> - free(dp->meters[meter_id]);
> - dp->meters[meter_id] = NULL;
> - }
> -}
> -
> /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
> * through the 'dp_netdevs' shash while freeing 'dp'. */
> static void
> @@ -1694,16 +1688,7 @@ dp_netdev_free(struct dp_netdev *dp)
> /* Upcalls must be disabled at this point */
> dp_netdev_destroy_upcall_lock(dp);
>
> - int i;
> -
> - for (i = 0; i < MAX_METERS; ++i) {
> - meter_lock(dp, i);
> - dp_delete_meter(dp, i);
> - meter_unlock(dp, i);
> - }
> - for (i = 0; i < N_METER_LOCKS; ++i) {
> - ovs_mutex_destroy(&dp->meter_locks[i]);
> - }
> + dp_netdev_meter_destroy(&dp->meter_tbl);
>
> free(dp->pmd_cmask);
> free(CONST_CAST(char *, dp->name));
> @@ -5713,14 +5698,197 @@ dp_netdev_disable_upcall(struct dp_netdev *dp)
>
>
> /* Meters */
> +static uint32_t
> +meter_hash(struct dp_meter_instance *ti, uint32_t id)
> +{
> + uint32_t n_meters = ti->n_meters;
> +
> + return id % n_meters;
> +}
Why do we need a hash here in this implementation?
The code below will be broken if meter_hash hashes different ids to the
same hash value. There should be no hash or there should be good collision
protection.
> +
> +static void
> +dp_meter_free(struct dp_meter *meter)
> +{
> + ovs_mutex_destroy(&meter->lock);
> + free(meter);
> +}
> +
> +static struct dp_meter_instance *
> +dp_meter_instance_alloc(const uint32_t size)
> +{
> + struct dp_meter_instance *ti;
> +
> + ti = xzalloc(sizeof(*ti) + sizeof(struct dp_meter *) * size);
Don't parenthesize argument of sizeof if it's a variable.
> + ti->n_meters = size;
> +
> + return ti;
> +}
> +
> +static void
> +dp_meter_instance_realloc(struct dp_meter_table *tbl, const uint32_t size)
> +{
> + struct dp_meter_instance *new_ti;
> + struct dp_meter_instance *ti;
> + int n_meters;
> + int i;
> +
> + new_ti = dp_meter_instance_alloc(size);
> +
> + ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
> + n_meters = MIN(size, ti->n_meters);
> +
> + for (i = 0; i < n_meters; i++) {
> + if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
> + new_ti->dp_meters[i] = ti->dp_meters[i];
> + }
> + }
> +
> + ovsrcu_set(&tbl->ti, new_ti);
> + ovsrcu_postpone(free, ti);
> +}
> +
> +static void
> +dp_meter_instance_insert(struct dp_meter_instance *ti,
'dp_meter_instance_insert' sounds like we're going to create a new
dp_meter_instance and insert it to dp_meter_table, but it's not the
case.
> + struct dp_meter *meter)
> +{
> + uint32_t hash;
> +
> + hash = meter_hash(ti, meter->id);
> + ovsrcu_set(&ti->dp_meters[hash], meter);
> +}
> +
> +static void
> +dp_meter_instance_remove(struct dp_meter_instance *ti,
> + struct dp_meter *meter)
> +{
> + uint32_t hash;
> +
> + hash = meter_hash(ti, meter->id);
> + ovsrcu_set(&ti->dp_meters[hash], NULL);
> +}
> +
> +static void
> +dp_netdev_meter_init(struct dp_meter_table *tbl)
> +{
> + struct dp_meter_instance *ti;
> +
> + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN);
> + ovsrcu_set(&tbl->ti, ti);
> +
> + ovs_mutex_init(&tbl->lock);
> + tbl->count = 0;
> +}
> +
> +static void
> +dp_netdev_meter_destroy(struct dp_meter_table *tbl)
'dp_netdev_meter_destroy' sounds like we're going to destroy a single meter.
> +{
> + struct dp_meter_instance *ti;
> + int i;
> +
> + ti = ovsrcu_get(struct dp_meter_instance *, &tbl->ti);
> + for (i = 0; i < ti->n_meters; i++) {
> + struct dp_meter *meter;
> +
> + meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[i]);
> + if (meter) {
> + ovsrcu_postpone(dp_meter_free, meter);
> + }
> + }
> +
> + ovsrcu_postpone(free, ti);
> + ovs_mutex_destroy(&tbl->lock);
> +}
> +
> +static struct dp_meter *
> +dp_meter_lookup(struct dp_meter_table *meter_tbl, uint32_t meter_id)
> +{
> + struct dp_meter_instance *ti;
> + struct dp_meter *meter;
> + uint32_t hash;
> +
> + ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
After getting an rcu-protected pointer, you have to check if it's a valid pointer.
> + hash = meter_hash(ti, meter_id);
> +
> + meter = ovsrcu_get(struct dp_meter *, &ti->dp_meters[hash]);
> + if (meter && meter->id == meter_id) {
> + return meter;
> + }
> +
> + return NULL;
> +}
> +
> +static void
> +dp_meter_detach_free(struct dp_meter_table *meter_tbl, uint32_t meter_id)
> + OVS_REQUIRES(meter_tbl->lock)
Please, keep thread safety annotations indented with 4 spaces from the left.
> +{
> + struct dp_meter_instance *ti;
> + struct dp_meter *meter;
> +
> + meter = dp_meter_lookup(meter_tbl, meter_id);
> + if (!meter) {
> + return;
> + }
> +
> + ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
> + dp_meter_instance_remove(ti, meter);
> + ovsrcu_postpone(dp_meter_free, meter);
> +
> + meter_tbl->count--;
> + /* Shrink the meter array if necessary. */
> + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN &&
> + meter_tbl->count <= (ti->n_meters / 4)) {
> + int half_size = ti->n_meters / 2;
> + int i;
> +
> + /* Avoid hash collision, don't move slots to other place.
> + * Make sure there are no references of meters in array
> + * which will be released.
> + */
> + for (i = half_size; i < ti->n_meters; i++) {
> + if (ovsrcu_get(struct dp_meter *, &ti->dp_meters[i])) {
> + return;
> + }
> + }
> +
> + dp_meter_instance_realloc(meter_tbl, half_size);
> + }
> +}
> +
> +static int
> +dp_meter_attach(struct dp_meter_table *meter_tbl, struct dp_meter *meter)
> + OVS_REQUIRES(meter_tbl->lock)
ditto.
> +{
> + struct dp_meter_instance *ti;
> + uint32_t hash;
> +
> + ti = ovsrcu_get(struct dp_meter_instance *, &meter_tbl->ti);
> + hash = meter_hash(ti, meter->id);
> +
> + if (OVS_UNLIKELY(ovsrcu_get(struct dp_meter *,
> + &ti->dp_meters[hash]))) {
> + VLOG_WARN("Failed to attach meter id %u to slot %u/%u.\n",
> + meter->id, hash, ti->n_meters);
> + return EBUSY;
How could this happen if you're always calling _detach_free before calling
attach?
> + }
> +
> + dp_meter_instance_insert(ti, meter);
> +
> + meter_tbl->count++;
> + if (meter_tbl->count >= ti->n_meters) {
> + dp_meter_instance_realloc(meter_tbl, ti->n_meters * 2);
> + }
> +
> + return 0;
> +}
> +
> static void
> dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
> struct ofputil_meter_features *features)
> {
> - features->max_meters = MAX_METERS;
> + features->max_meters = METER_ENTRY_MAX;
> features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
> features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
> - features->max_bands = MAX_BANDS;
> + features->max_bands = METER_BAND_MAX;
> features->max_color = 0;
> }
>
> @@ -5742,14 +5910,13 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct
> dp_packet_batch *packets_,
> uint32_t exceeded_rate[NETDEV_MAX_BURST];
> int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
>
> - if (meter_id >= MAX_METERS) {
> + if (meter_id >= METER_ENTRY_MAX) {
> return;
> }
>
> - meter_lock(dp, meter_id);
> - meter = dp->meters[meter_id];
> + meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
> if (!meter) {
> - goto out;
> + return;
> }
>
> /* Initialize as negative values. */
> @@ -5757,6 +5924,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct
> dp_packet_batch *packets_,
> /* Initialize as zeroes. */
> memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
>
> + ovs_mutex_lock(&meter->lock);
> /* All packets will hit the meter at the same time. */
> long_delta_t = now / 1000 - meter->used / 1000; /* msec */
>
> @@ -5874,8 +6042,8 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct
> dp_packet_batch *packets_,
> dp_packet_batch_refill(packets_, packet, j);
> }
> }
> - out:
> - meter_unlock(dp, meter_id);
> +
> + ovs_mutex_unlock(&meter->lock);
> }
>
> /* Meter set/get/del processing is still single-threaded. */
> @@ -5884,11 +6052,12 @@ dpif_netdev_meter_set(struct dpif *dpif,
> ofproto_meter_id meter_id,
> struct ofputil_meter_config *config)
> {
> struct dp_netdev *dp = get_dp_netdev(dpif);
> + struct dp_meter_table *meter_tbl = &dp->meter_tbl;
> uint32_t mid = meter_id.uint32;
> struct dp_meter *meter;
> - int i;
> + int err, i;
>
> - if (mid >= MAX_METERS) {
> + if (mid >= METER_ENTRY_MAX) {
> return EFBIG; /* Meter_id out of range. */
> }
>
> @@ -5896,7 +6065,7 @@ dpif_netdev_meter_set(struct dpif *dpif,
> ofproto_meter_id meter_id,
> return EBADF; /* Unsupported flags set */
> }
>
> - if (config->n_bands > MAX_BANDS) {
> + if (config->n_bands > METER_BAND_MAX) {
> return EINVAL;
> }
>
> @@ -5917,6 +6086,8 @@ dpif_netdev_meter_set(struct dpif *dpif,
> ofproto_meter_id meter_id,
> meter->n_bands = config->n_bands;
> meter->max_delta_t = 0;
> meter->used = time_usec();
> + meter->id = mid;
> + ovs_mutex_init(&meter->lock);
>
> /* set up bands */
> for (i = 0; i < config->n_bands; ++i) {
> @@ -5942,12 +6113,22 @@ dpif_netdev_meter_set(struct dpif *dpif,
> ofproto_meter_id meter_id,
> }
> }
>
> - meter_lock(dp, mid);
> - dp_delete_meter(dp, mid); /* Free existing meter, if any */
> - dp->meters[mid] = meter;
> - meter_unlock(dp, mid);
> + ovs_mutex_lock(&meter_tbl->lock);
> +
> + dp_meter_detach_free(meter_tbl, mid); /* Free existing meter, if any */
This doesn't look correct. Why should we destroy some other meter to create
this new one?
> + err = dp_meter_attach(meter_tbl, meter);
> + if (err) {
> + goto unlock_out;
> + }
> +
> + ovs_mutex_unlock(&meter_tbl->lock);
>
> return 0;
> +
> +unlock_out:
> + ovs_mutex_unlock(&meter_tbl->lock);
> + dp_meter_free(meter);
> + return err;
> }
>
> static int
> @@ -5955,23 +6136,23 @@ dpif_netdev_meter_get(const struct dpif *dpif,
> ofproto_meter_id meter_id_,
> struct ofputil_meter_stats *stats, uint16_t n_bands)
> {
> - const struct dp_netdev *dp = get_dp_netdev(dpif);
> + struct dp_netdev *dp = get_dp_netdev(dpif);
> uint32_t meter_id = meter_id_.uint32;
> - int retval = 0;
> + const struct dp_meter *meter;
>
> - if (meter_id >= MAX_METERS) {
> + if (meter_id >= METER_ENTRY_MAX) {
> return EFBIG;
> }
>
> - meter_lock(dp, meter_id);
> - const struct dp_meter *meter = dp->meters[meter_id];
> + meter = dp_meter_lookup(&dp->meter_tbl, meter_id);
> if (!meter) {
> - retval = ENOENT;
> - goto done;
> + return ENOENT;
> }
> +
> if (stats) {
> int i = 0;
>
> + ovs_mutex_lock(&meter->lock);
> stats->packet_in_count = meter->packet_count;
> stats->byte_in_count = meter->byte_count;
>
> @@ -5979,13 +6160,12 @@ dpif_netdev_meter_get(const struct dpif *dpif,
> stats->bands[i].packet_count = meter->bands[i].packet_count;
> stats->bands[i].byte_count = meter->bands[i].byte_count;
> }
> + ovs_mutex_unlock(&meter->lock);
>
> stats->n_bands = i;
> }
>
> -done:
> - meter_unlock(dp, meter_id);
> - return retval;
> + return 0;
> }
>
> static int
> @@ -5994,15 +6174,16 @@ dpif_netdev_meter_del(struct dpif *dpif,
> struct ofputil_meter_stats *stats, uint16_t n_bands)
> {
> struct dp_netdev *dp = get_dp_netdev(dpif);
> + struct dp_meter_table *meter_tbl = &dp->meter_tbl;
> int error;
>
> error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
> if (!error) {
> uint32_t meter_id = meter_id_.uint32;
>
> - meter_lock(dp, meter_id);
> - dp_delete_meter(dp, meter_id);
> - meter_unlock(dp, meter_id);
> + ovs_mutex_lock(&meter_tbl->lock);
> + dp_meter_detach_free(meter_tbl, meter_id);
> + ovs_mutex_unlock(&meter_tbl->lock);
> }
> return error;
> }
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev