Re: [ovs-dev] [PATCH ovn] lflow-cache: Automatically trim cache when it becomes inactive.

Numan Siddique Tue, 30 Nov 2021 10:18:15 -0800

On Tue, Nov 30, 2021 at 12:07 PM Dumitru Ceara <[email protected]> wrote:
>
> On 11/30/21 17:54, Numan Siddique wrote:
> > On Tue, Nov 23, 2021 at 2:49 PM Dumitru Ceara <[email protected]> wrote:
> >>
> >> By default perform memory trimming 30 seconds after the lflow cache
> >> utilization has dropped.  Otherwise, idle ovn-controller processes might
> >> wastefully fail to return unused memory to the system.  The timeout is
> >> configurable through the 'ovn-trim-timeout-ms' external_id.
> >>
> >> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2024768
> >> Signed-off-by: Dumitru Ceara <[email protected]>
> >
> > Thanks for adding this support.  The patch and approach LGTM.
> >
> > I just have one question.  Please see below.
>
> Thanks for the review!
>
> >
> > Thanks
> > Numan
> >
> >> ---
> >>  NEWS                            |  3 ++
> >>  controller/chassis.c            | 16 ++++++++
> >>  controller/lflow-cache.c        | 67 ++++++++++++++++++++++++++++++++-
> >>  controller/lflow-cache.h        |  5 ++-
> >>  controller/ovn-controller.8.xml |  8 ++++
> >>  controller/ovn-controller.c     |  9 ++++-
> >>  controller/test-lflow-cache.c   | 13 +++++--
> >>  7 files changed, 114 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/NEWS b/NEWS
> >> index e27aaad06..9880a678e 100644
> >> --- a/NEWS
> >> +++ b/NEWS
> >> @@ -12,6 +12,9 @@ Post v21.09.0
> >>      running on SmartNIC control plane CPUs.  Please refer to
> >>      Documentation/topics/vif-plug-providers/vif-plug-providers.rst for 
> >> more
> >>      information.
> >> +  - ovn-controller: Add configuration knob, through OVS external-id
> >> +    "ovn-trim-timeout-ms" to allow specifiying an lflow cache inactivity
> >> +    timeout after which ovn-controller should trim memory.
> >>
> >>  OVN v21.09.0 - 01 Oct 2021
> >>  --------------------------
> >> diff --git a/controller/chassis.c b/controller/chassis.c
> >> index c11c10a34..8a1559653 100644
> >> --- a/controller/chassis.c
> >> +++ b/controller/chassis.c
> >> @@ -54,6 +54,7 @@ struct ovs_chassis_cfg {
> >>      const char *memlimit_lflow_cache;
> >>      const char *trim_limit_lflow_cache;
> >>      const char *trim_wmark_perc_lflow_cache;
> >> +    const char *trim_timeout_ms;
> >>
> >>      /* Set of encap types parsed from the 'ovn-encap-type' external-id. */
> >>      struct sset encap_type_set;
> >> @@ -163,6 +164,12 @@ get_trim_wmark_perc_lflow_cache(const struct smap 
> >> *ext_ids)
> >>      return smap_get_def(ext_ids, "ovn-trim-wmark-perc-lflow-cache", "");
> >>  }
> >>
> >> +static const char *
> >> +get_trim_timeout(const struct smap *ext_ids)
> >> +{
> >> +    return smap_get_def(ext_ids, "ovn-trim-timeout-ms", "");
> >> +}
> >> +
> >>  static const char *
> >>  get_encap_csum(const struct smap *ext_ids)
> >>  {
> >> @@ -292,6 +299,7 @@ chassis_parse_ovs_config(const struct 
> >> ovsrec_open_vswitch_table *ovs_table,
> >>          get_trim_limit_lflow_cache(&cfg->external_ids);
> >>      ovs_cfg->trim_wmark_perc_lflow_cache =
> >>          get_trim_wmark_perc_lflow_cache(&cfg->external_ids);
> >> +    ovs_cfg->trim_timeout_ms = get_trim_timeout(&cfg->external_ids);
> >>
> >>      if (!chassis_parse_ovs_encap_type(encap_type, 
> >> &ovs_cfg->encap_type_set)) {
> >>          return false;
> >> @@ -336,6 +344,7 @@ chassis_build_other_config(const struct 
> >> ovs_chassis_cfg *ovs_cfg,
> >>                   ovs_cfg->trim_limit_lflow_cache);
> >>      smap_replace(config, "ovn-trim-wmark-perc-lflow-cache",
> >>                   ovs_cfg->trim_wmark_perc_lflow_cache);
> >> +    smap_replace(config, "ovn-trim-timeout-ms", ovs_cfg->trim_timeout_ms);
> >>      smap_replace(config, "iface-types", 
> >> ds_cstr_ro(&ovs_cfg->iface_types));
> >>      smap_replace(config, "ovn-chassis-mac-mappings", 
> >> ovs_cfg->chassis_macs);
> >>      smap_replace(config, "is-interconn",
> >> @@ -415,6 +424,13 @@ chassis_other_config_changed(const struct 
> >> ovs_chassis_cfg *ovs_cfg,
> >>          return true;
> >>      }
> >>
> >> +    const char *chassis_trim_timeout_ms =
> >> +        get_trim_timeout(&chassis_rec->other_config);
> >> +
> >> +    if (strcmp(ovs_cfg->trim_timeout_ms, chassis_trim_timeout_ms)) {
> >> +        return true;
> >> +    }
> >> +
> >>      const char *chassis_mac_mappings =
> >>          get_chassis_mac_mappings(&chassis_rec->other_config);
> >>      if (strcmp(ovs_cfg->chassis_macs, chassis_mac_mappings)) {
> >> diff --git a/controller/lflow-cache.c b/controller/lflow-cache.c
> >> index 6db85fc12..9c3db06e7 100644
> >> --- a/controller/lflow-cache.c
> >> +++ b/controller/lflow-cache.c
> >> @@ -24,8 +24,10 @@
> >>  #include "coverage.h"
> >>  #include "lflow-cache.h"
> >>  #include "lib/uuid.h"
> >> +#include "openvswitch/poll-loop.h"
> >>  #include "openvswitch/vlog.h"
> >>  #include "ovn/expr.h"
> >> +#include "timeval.h"
> >>
> >>  VLOG_DEFINE_THIS_MODULE(lflow_cache);
> >>
> >> @@ -59,7 +61,10 @@ struct lflow_cache {
> >>      uint64_t max_mem_usage;
> >>      uint32_t trim_limit;
> >>      uint32_t trim_wmark_perc;
> >> +    uint32_t trim_timeout_ms;
> >>      uint64_t trim_count;
> >> +    long long int last_active_ms;
> >> +    bool recently_active;
> >>      bool enabled;
> >>  };
> >>
> >> @@ -79,6 +84,7 @@ static struct lflow_cache_value *lflow_cache_add__(
> >>  static void lflow_cache_delete__(struct lflow_cache *lc,
> >>                                   struct lflow_cache_entry *lce);
> >>  static void lflow_cache_trim__(struct lflow_cache *lc, bool force);
> >> +static void lflow_cache_record_activity__(struct lflow_cache *lc);
> >>
> >>  struct lflow_cache *
> >>  lflow_cache_create(void)
> >> @@ -128,7 +134,7 @@ lflow_cache_destroy(struct lflow_cache *lc)
> >>  void
> >>  lflow_cache_enable(struct lflow_cache *lc, bool enabled, uint32_t 
> >> capacity,
> >>                     uint64_t max_mem_usage_kb, uint32_t lflow_trim_limit,
> >> -                   uint32_t trim_wmark_perc)
> >> +                   uint32_t trim_wmark_perc, uint32_t trim_timeout_ms)
> >>  {
> >>      if (!lc) {
> >>          return;
> >> @@ -141,6 +147,13 @@ lflow_cache_enable(struct lflow_cache *lc, bool 
> >> enabled, uint32_t capacity,
> >>          trim_wmark_perc = 100;
> >>      }
> >>
> >> +    if (trim_timeout_ms < 1000) {
> >> +        VLOG_WARN_RL(&rl, "Invalid requested trim timeout: "
> >> +                     "requested %"PRIu32" ms, using 1000 ms instead",
> >> +                     trim_timeout_ms);
> >> +        trim_timeout_ms = 1000;
> >> +    }
> >> +
> >>      uint64_t max_mem_usage = max_mem_usage_kb * 1024;
> >>      bool need_flush = false;
> >>      bool need_trim = false;
> >> @@ -160,10 +173,13 @@ lflow_cache_enable(struct lflow_cache *lc, bool 
> >> enabled, uint32_t capacity,
> >>      lc->max_mem_usage = max_mem_usage;
> >>      lc->trim_limit = lflow_trim_limit;
> >>      lc->trim_wmark_perc = trim_wmark_perc;
> >> +    lc->trim_timeout_ms = trim_timeout_ms;
> >>
> >>      if (need_flush) {
> >> +        lflow_cache_record_activity__(lc);
> >>          lflow_cache_flush(lc);
> >>      } else if (need_trim) {
> >> +        lflow_cache_record_activity__(lc);
> >>          lflow_cache_trim__(lc, false);
> >>      }
> >>  }
> >> @@ -271,6 +287,7 @@ lflow_cache_delete(struct lflow_cache *lc, const 
> >> struct uuid *lflow_uuid)
> >>          lflow_cache_delete__(lc, CONTAINER_OF(lcv, struct 
> >> lflow_cache_entry,
> >>                                                value));
> >>          lflow_cache_trim__(lc, false);
> >> +        lflow_cache_record_activity__(lc);
> >>      }
> >>  }
> >>
> >> @@ -308,6 +325,46 @@ lflow_cache_get_memory_usage(const struct lflow_cache 
> >> *lc, struct simap *usage)
> >>                     ROUND_UP(lc->mem_usage, 1024) / 1024);
> >>  }
> >>
> >> +void
> >> +lflow_cache_run(struct lflow_cache *lc)
> >> +{
> >> +    if (!lc->recently_active) {
> >> +        return;
> >> +    }
> >> +
> >> +    long long int now = time_msec();
> >> +    if (now < lc->last_active_ms || now < lc->trim_timeout_ms) {
> >> +        VLOG_WARN_RL(&rl, "Detected cache last active timestamp 
> >> overflow");
> >> +        lc->recently_active = false;
> >> +        lflow_cache_trim__(lc, true);
> >> +        return;
> >> +    }
> >> +
> >> +    if (now < lc->trim_timeout_ms) {
> >> +        VLOG_WARN_RL(&rl, "Detected very large trim timeout: "
> >> +                     "now %lld ms timeout %"PRIu32" ms",
> >> +                     now, lc->trim_timeout_ms);
> >> +        return;
> >> +    }
> >> +
> >> +    if (now - lc->trim_timeout_ms >= lc->last_active_ms) {
> >> +        VLOG_INFO_RL(&rl, "Detected cache inactivity "
> >> +                    "(last active %lld ms ago): trimming cache",
> >> +                    now - lc->last_active_ms);
> >> +        lflow_cache_trim__(lc, true);
> >> +        lc->recently_active = false;
> >> +    }
> >> +}
> >> +
> >> +void
> >> +lflow_cache_wait(struct lflow_cache *lc)
> >> +{
> >> +    if (!lc->recently_active) {
> >> +        return;
> >> +    }
> >> +    poll_timer_wait_until(lc->last_active_ms + lc->trim_timeout_ms);
> >> +}
> >> +
> >>  static struct lflow_cache_value *
> >>  lflow_cache_add__(struct lflow_cache *lc, const struct uuid *lflow_uuid,
> >>                    enum lflow_cache_type type, uint64_t value_size)
> >> @@ -332,6 +389,7 @@ lflow_cache_add__(struct lflow_cache *lc, const struct 
> >> uuid *lflow_uuid,
> >>          }
> >>      }
> >>
> >> +    lflow_cache_record_activity__(lc);
> >
> > Do we need to record activity when a cache entry is added ?   If a
> > logical flow expr is cached,
> > we probably don't need to trim the memory right as we probably are not
> > freeing any memory in cache?
> > It may be sufficient to trim the memory when an entry is deleted.
> >
> > Although it's possible that processing a logical flow and adding it to
> > the cache may not free any memory in cache,
> > but ovn-controller / IDL code may have freed the memory elsewhere.  So
> > there could be benefits in trimming.
> > I'm just wondering if it would be an overkill ? What do you think ?
>
> What I was trying to achieve was something like "if there are no recent
> changes to the cache state" then try to trim the cache (and memory).
>
> The main heuristic being that if there was a recent change then the
> chance that something changes in the near future is bigger so we might
> as well delay trimming for a bit.  It's true that there might be long
> periods of churn but in that case we also have the trim watermark
> percentage to trigger trimming when the actual used cache size decreases.
>
> If we don't record activity when a cache entry is added then we'll
> actually end up trimming memory more often than we do with the proposed
> approach.


Thanks for the detailed explanation.  Makes sense.

I applied the patch to the main branch.

Numan

>
> >
> > Thanks
> > Numan
> >
>
> Thanks,
> Dumitru
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH ovn] lflow-cache: Automatically trim cache when it becomes inactive.

Reply via email to