On 4/22/20 10:36 AM, Eiichi Tsukata wrote:
> Currently classifier tries and n_tries can be updated not atomically,
> there is a race condition which can lead to NULL dereference.
> The race can happen when main thread updates a classifier tries and
> n_tries in classifier_set_prefix_fields() and at the same time revalidator
> or handler thread try to lookup them in classifier_lookup__(). Such race
> can be triggered when user changes prefixes of flow_table.
> 
> Race(user changes flow_table prefixes: ip_dst,ip_src => none):
> 
>   [main thread]             [revalidator/handler thread]
>   ===========================================================
>                             /* cls->n_tries == 2 */
>                             for (int i = 0; i < cls->n_tries; i++) {
>   trie_init(cls, i, NULL);
>   /* n_tries == 0 */
>   cls->n_tries = n_tries;
>                             /* cls->tries[i]->feild is NULL */
>                             trie_ctx_init(&trie_ctx[i],&cls->tries[i]);
>                             /* trie->field is NULL */
>                             ctx->be32ofs = trie->field->flow_be32ofs;
> 
> To prevent the race, instead of re-introducing internal mutex
> implemented in the commit fccd7c092e09 ("classifier: Remove internal
> mutex."), this patch makes trie field RCU protected and checks it after
> read.
> 
> Fixes: fccd7c092e09 ("classifier: Remove internal mutex.")
> Signed-off-by: Eiichi Tsukata <[email protected]>
> ---
>  lib/classifier.c        | 36 ++++++++++++++++++++++--------------
>  lib/classifier.h        |  6 ++++--
>  tests/test-classifier.c |  5 +++--
>  3 files changed, 29 insertions(+), 18 deletions(-)

Hi.  Thanks for the new version it seems kind of OK except few comments inline.

IIUC, the main issue this patch is trying to solve is that order of reads in
readers is opposite to what writer thinks it is, i.e. writer assumes that
'trie->field' will be accessed only after checking trie_plen, but readers reads
'trie->field' first to the local context and only after that checks the 
trie_plen.
And this patch seems to fix this issue.

But in general I'm still not convinced that the code here is thread- and 
logically
safe.  And it's really hard to read it and understand what's happening and why.
For me the most controversial part is that we're using subtable->trie_plen as
an indicator that we could use the trie and the second one is that we're 
modifying
all the non-protected fields of struct cls_trie and cls->n_tries separately 
while
it only makes sense doing that at the same time which it was whan we had a lock.

So, in long term instead of converting each filed of struct cls_trie to rcu 
protected
variable I'd like to see something like conversion of cls->tries from an array 
to an
rcu protected dynamic array like pvector.  This way we could remove n_tries 
filed
from cls and be sure that all the tries that are in our vector are in consistent
state and could actually be used.

> 
> diff --git a/lib/classifier.c b/lib/classifier.c
> index 0fad953..3d0a49e 100644
> --- a/lib/classifier.c
> +++ b/lib/classifier.c
> @@ -393,7 +393,9 @@ classifier_set_prefix_fields(struct classifier *cls,
>          bitmap_set1(fields.bm, trie_fields[i]);
>  
>          new_fields[n_tries] = NULL;
> -        if (n_tries >= cls->n_tries || field != cls->tries[n_tries].field) {
> +        const struct mf_field *cls_field
> +            = ovsrcu_get(struct mf_field *, &cls->tries[n_tries].field);
> +        if (n_tries >= cls->n_tries || field != cls_field) {
>              new_fields[n_tries] = field;
>              changed = true;
>          }
> @@ -454,7 +456,7 @@ trie_init(struct classifier *cls, int trie_idx, const 
> struct mf_field *field)
>      } else {
>          ovsrcu_set_hidden(&trie->root, NULL);
>      }
> -    trie->field = field;
> +    ovsrcu_set(&trie->field, field);

Why not ovsrcu_set_hidden?

>  
>      /* Add existing rules to the new trie. */
>      CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
> @@ -849,7 +851,6 @@ static void
>  trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie)
>  {
>      ctx->trie = trie;
> -    ctx->be32ofs = trie->field->flow_be32ofs;

Shouldn't we remove 'be32ofs' filed from 'struct trie_ctx'?

>      ctx->lookup_done = false;
>  }
>  
> @@ -1531,8 +1532,10 @@ insert_subtable(struct classifier *cls, const struct 
> minimask *mask)
>      *CONST_CAST(uint8_t *, &subtable->n_indices) = index;
>  
>      for (i = 0; i < cls->n_tries; i++) {
> -        subtable->trie_plen[i] = minimask_get_prefix_len(mask,
> -                                                         
> cls->tries[i].field);
> +        const struct mf_field *field
> +            = ovsrcu_get(struct mf_field *, &cls->tries[i].field);
> +        subtable->trie_plen[i]
> +            = field ? minimask_get_prefix_len(mask, field) : 0;
>      }
>  
>      /* Ports trie. */
> @@ -1577,8 +1580,10 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], 
> unsigned int n_tries,
>      for (j = 0; j < n_tries; j++) {
>          /* Is the trie field relevant for this subtable, and
>             is the trie field within the current range of fields? */
> -        if (field_plen[j] &&
> -            flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) {
> +        const struct mf_field *ctx_field
> +            = ovsrcu_get(struct mf_field *, &trie_ctx[j].trie->field);
> +        if (field_plen[j] && ctx_field &&
> +            flowmap_is_set(&range_map, ctx_field->flow_be32ofs / 2)) {
>              struct trie_ctx *ctx = &trie_ctx[j];
>  
>              /* On-demand trie lookup. */
> @@ -1601,14 +1606,16 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], 
> unsigned int n_tries,
>                   * than this subtable would otherwise. */
>                  if (ctx->maskbits <= field_plen[j]) {
>                      /* Unwildcard the bits and skip the rest. */
> -                    mask_set_prefix_bits(wc, ctx->be32ofs, ctx->maskbits);
> +                    mask_set_prefix_bits(wc, ctx_field->flow_be32ofs,
> +                                         ctx->maskbits);
>                      /* Note: Prerequisite already unwildcarded, as the only
>                       * prerequisite of the supported trie lookup fields is
>                       * the ethertype, which is always unwildcarded. */
>                      return true;
>                  }
>                  /* Can skip if the field is already unwildcarded. */
> -                if (mask_prefix_bits_set(wc, ctx->be32ofs, ctx->maskbits)) {
> +                if (mask_prefix_bits_set(wc, ctx_field->flow_be32ofs,
> +                                         ctx->maskbits)) {
>                      return true;
>                  }
>              }
> @@ -2001,12 +2008,12 @@ static unsigned int
>  trie_lookup(const struct cls_trie *trie, const struct flow *flow,
>              union trie_prefix *plens)
>  {
> -    const struct mf_field *mf = trie->field;
> +    const struct mf_field *mf = ovsrcu_get(struct mf_field *, &trie->field);
>  
>      /* Check that current flow matches the prerequisites for the trie
>       * field.  Some match fields are used for multiple purposes, so we
>       * must check that the trie is relevant for this flow. */
> -    if (mf_are_prereqs_ok(mf, flow, NULL)) {
> +    if (mf && mf_are_prereqs_ok(mf, flow, NULL)) {
>          return trie_lookup_value(&trie->root,
>                                   &((ovs_be32 *)flow)[mf->flow_be32ofs],
>                                   &plens->be32, mf->n_bits);
> @@ -2053,8 +2060,9 @@ minimask_get_prefix_len(const struct minimask *minimask,
>   * happened to be zeros.
>   */
>  static const ovs_be32 *
> -minimatch_get_prefix(const struct minimatch *match, const struct mf_field 
> *mf)
> +minimatch_get_prefix(const struct minimatch *match, rcu_field_ptr *field)
>  {
> +    struct mf_field *mf = ovsrcu_get_protected(struct mf_field *, field);
>      size_t u64_ofs = mf->flow_be32ofs / 2;
>  
>      return (OVS_FORCE const ovs_be32 *)miniflow_get__(match->flow, u64_ofs)
> @@ -2068,7 +2076,7 @@ static void
>  trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
>  {
>      trie_insert_prefix(&trie->root,
> -                       minimatch_get_prefix(&rule->match, trie->field), 
> mlen);
> +                       minimatch_get_prefix(&rule->match, &trie->field), 
> mlen);
>  }
>  
>  static void
> @@ -2123,7 +2131,7 @@ static void
>  trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
>  {
>      trie_remove_prefix(&trie->root,
> -                       minimatch_get_prefix(&rule->match, trie->field), 
> mlen);
> +                       minimatch_get_prefix(&rule->match, &trie->field), 
> mlen);
>  }
>  
>  /* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
> diff --git a/lib/classifier.h b/lib/classifier.h
> index d1bd4aa..f646a8f 100644
> --- a/lib/classifier.h
> +++ b/lib/classifier.h
> @@ -314,13 +314,15 @@ extern "C" {
>  struct cls_subtable;
>  struct cls_match;
>  
> +struct mf_field;
> +typedef OVSRCU_TYPE(struct mf_field *) rcu_field_ptr;
>  struct trie_node;
>  typedef OVSRCU_TYPE(struct trie_node *) rcu_trie_ptr;
>  
>  /* Prefix trie for a 'field' */
>  struct cls_trie {
> -    const struct mf_field *field; /* Trie field, or NULL. */
> -    rcu_trie_ptr root;            /* NULL if none. */
> +    rcu_field_ptr field;   /* Trie field, or NULL. */
> +    rcu_trie_ptr root;     /* NULL if none. */
>  };
>  
>  enum {
> diff --git a/tests/test-classifier.c b/tests/test-classifier.c
> index 6d53d01..2d98fad 100644
> --- a/tests/test-classifier.c
> +++ b/tests/test-classifier.c
> @@ -512,8 +512,9 @@ verify_tries(struct classifier *cls)
>      int i;
>  
>      for (i = 0; i < cls->n_tries; i++) {
> -        n_rules += trie_verify(&cls->tries[i].root, 0,
> -                               cls->tries[i].field->n_bits);
> +        const struct mf_field * cls_field
> +            = ovsrcu_get(struct mf_field *, &cls->tries[i].field);
> +        n_rules += trie_verify(&cls->tries[i].root, 0, cls_field->n_bits);
>      }
>      assert(n_rules <= cls->n_rules);
>  }
> 

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to