On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
> First, in tcp_close, reduce the scope of the sk_callback_lock: the
> lock is only needed to protect smap_release_sock(); the ingress and
> cork lists are protected by the sock lock. Having the lock in a wider
> scope is harmless, but it may confuse a reader into inferring that it
> is in fact needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>      [...]
>      spin_lock(bucket_lock)
>      l = lookup_elem_raw()
>      if (l)
>         hlist_del_rcu()
>         write_lock(sk_callback_lock)
>          .... destroy psock ...
>         write_unlock(sk_callback_lock)
>      spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table, which we can't do unless we hold the
> bucket lock. Once we have the bucket lock and the psock element, it is
> deleted from the hashmap to ensure any other path doing a lookup will
> fail. Finally, the refcnt is decremented and, if it hits zero, the
> psock is destroyed.
> 
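> (Condensed from the existing sock_hash_delete_elem() as a concrete
>  sketch of that pattern; error handling and the psock NULL check are
>  trimmed, so treat it as illustration only:
> 
>    b = __select_bucket(htab, hash);
>    raw_spin_lock_bh(&b->lock);
>    l = lookup_elem_raw(&b->head, hash, key, key_size);
>    if (l) {
>            struct sock *sock = l->sk;
> 
>            hlist_del_rcu(&l->hash_node);
>            write_lock_bh(&sock->sk_callback_lock);
>            psock = smap_psock_sk(sock);
>            smap_list_remove(psock, NULL, l);
>            smap_release_sock(psock, sock); /* dec refcnt, destroy at zero */
>            write_unlock_bh(&sock->sk_callback_lock);
>            free_htab_elem(htab, l);
>    }
>    raw_spin_unlock_bh(&b->lock);
>  )
> 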
> In parallel with the above (or with freeing the map), a tcp close
> event may trigger tcp_close(), which at the moment omits the bucket
> lock altogether (oops!). The flow looks like this,
> 
>   bpf_tcp_close()
>      [...]
>      write_lock(sk_callback_lock)
>      for each psock->maps // list of maps this sock is part of
>          hlist_del_rcu(ref_hash_node);
>          .... destroy psock ...
>      write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock, but that would create a lock inversion problem. In
> summary: to follow the locking rules, the maps list needs the
> sk_callback_lock, but we need the bucket lock to do the
> hlist_del_rcu(). To resolve the lock inversion, note that when
> bpf_tcp_close() is called no updates can happen in parallel, due to
> the ESTABLISHED state check in the update logic, so pop the head of
> the list repeatedly and remove the reference until no more are left.
> If a delete happens in parallel from the BPF API that is OK as well,
> because it will do a similar action: look up the sock in the map/hash,
> delete it from the map/hash, and dec the refcnt. We check for this
> case before doing a destroy on the psock to ensure we don't have two
> threads tearing down the same psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // pop head under sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>      // only get here if the element was not already removed
>      hlist_del_rcu()
>      ... destroy psock...
>   }
>   bucket_unlock()
> 
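> (The inversion this avoids, spelled out as lock-ordering chains:
> 
>    delete path: bucket_lock -> sk_callback_lock
>    close path:  sk_callback_lock -> bucket_lock
> 
>  Popping each entry under sk_callback_lock alone, then taking the
>  bucket lock with no other lock held, keeps the ordering acyclic.)
> 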
> And finally, for all the above to work, add the missing
> sk_callback_lock around smap_list_remove() in
> sock_hash_ctx_update_elem(). Otherwise delete and update may corrupt
> the maps list.
> 
> (As an aside, the sk_callback_lock serves two purposes. The first is
>  to update the sock callbacks sk_data_ready, sk_write_space, etc. The
>  second is to protect the psock 'maps' list. The 'maps' list is used
>  (as shown above) to delete all map/hash references to a sock when the
>  sock is closed.)
> 
> (If we did not have the ESTABLISHED state guarantee from tcp_close,
>  then we could not ensure completion, because updates could happen
>  forever and pin the thread in the delete loop.)
> 
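> (The update-side guard referred to above is, roughly, the ESTABLISHED
>  check in the sockmap update path; the exact condition in the tree may
>  differ, so read this as an illustrative sketch:
> 
>    if (skops->sk->sk_state != TCP_ESTABLISHED)
>            return -EOPNOTSUPP;
> 
>  Once a sock leaves ESTABLISHED, no new map/hash entries can be added
>  for it, so the pop loop above must drain.)
> 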
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend <john.fastab...@gmail.com>
> ---
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index f1ab52d..04764f5 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -258,16 +258,54 @@ static void bpf_tcp_release(struct sock *sk)
>       rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +                                      u32 hash, void *key, u32 key_size)
> +{
> +     struct htab_elem *l;
> +
> +     hlist_for_each_entry_rcu(l, head, hash_node) {
> +             if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +                     return l;
> +     }
> +
> +     return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +     return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +     return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>       atomic_dec(&htab->count);
>       kfree_rcu(l, rcu);
>  }
>  
> +struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
> +                                        struct smap_psock *psock)
> +{
> +     struct smap_psock_map_entry *e;
> +
> +     write_lock_bh(&sk->sk_callback_lock);
> +     e = list_first_entry_or_null(&psock->maps,
> +                                  struct smap_psock_map_entry,
> +                                  list);
> +     if (e)
> +             list_del(&e->list);
> +     write_unlock_bh(&sk->sk_callback_lock);
> +     return e;
> +}
> +
>  static void bpf_tcp_close(struct sock *sk, long timeout)
>  {
>       void (*close_fun)(struct sock *sk, long timeout);
> -     struct smap_psock_map_entry *e, *tmp;
> +     struct smap_psock_map_entry *e;
>       struct sk_msg_buff *md, *mtmp;
>       struct smap_psock *psock;
>       struct sock *osk;
> @@ -286,7 +324,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>        */
>       close_fun = psock->save_close;
>  
> -     write_lock_bh(&sk->sk_callback_lock);
>       if (psock->cork) {
>               free_start_sg(psock->sock, psock->cork);
>               kfree(psock->cork);
> @@ -299,20 +336,48 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>               kfree(md);
>       }
>  
> -     list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> +     /* Sock is in TCP_CLOSE state so any concurrent adds or updates will be
> +      * blocked by the ESTABLISHED check. However, tcp_close() + delete +
> +      * free can all run at the same time. If a tcp_close + delete happens,
> +      * each code path will remove the entry from the map/hash before
> +      * deleting it. In the map case an xchg and a check that we have a sk
> +      * protect the two paths from tearing down the same object. For the
> +      * hash map we lock the bucket and remove the object from the hash map
> +      * before destroying it, ensuring only one reference exists. By pulling
> +      * objects off the head of the list (with sk_callback_lock held) we
> +      * avoid duplicate references when multiple deleters run.
> +      */
> +     e = psock_map_pop(sk, psock);
> +     while (e) {
>               if (e->entry) {
>                       osk = cmpxchg(e->entry, sk, NULL);
>                       if (osk == sk) {
> -                             list_del(&e->list);
>                               smap_release_sock(psock, sk);
>                       }
>               } else {
> -                     hlist_del_rcu(&e->hash_link->hash_node);
> -                     smap_release_sock(psock, e->hash_link->sk);
> -                     free_htab_elem(e->htab, e->hash_link);
> +                     struct htab_elem *link = e->hash_link;
> +                     struct hlist_head *head;
> +                     struct htab_elem *l;
> +                     struct bucket *b;
> +
> +                     b = __select_bucket(e->htab, link->hash);
> +                     head = &b->head;
> +                     raw_spin_lock_bh(&b->lock);
> +                     l = lookup_elem_raw(head,
> +                                         link->hash, link->key,
> +                                         e->htab->elem_size);
> +                     /* If another thread deleted this object skip deletion.
> +                      * The refcnt on psock may or may not be zero.
> +                      */
> +                     if (l) {
> +                             hlist_del_rcu(&e->hash_link->hash_node);
> +                             smap_release_sock(psock, e->hash_link->sk);
> +                             free_htab_elem(e->htab, e->hash_link);
> +                     }
> +                     raw_spin_unlock_bh(&b->lock);
>               }
> +             e = psock_map_pop(sk, psock);
>       }
> -     write_unlock_bh(&sk->sk_callback_lock);
>       rcu_read_unlock();
>       close_fun(sk, timeout);
>  }
> @@ -2088,16 +2153,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
>       return ERR_PTR(err);
>  }
>  
> -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -     return &htab->buckets[hash & (htab->n_buckets - 1)];
> -}
> -
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -     return &__select_bucket(htab, hash)->head;
> -}
> -
>  static void sock_hash_free(struct bpf_map *map)
>  {
>       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> @@ -2114,10 +2169,13 @@ static void sock_hash_free(struct bpf_map *map)
>        */
>       rcu_read_lock();
>       for (i = 0; i < htab->n_buckets; i++) {
> -             struct hlist_head *head = select_bucket(htab, i);
> +             struct bucket *b = __select_bucket(htab, i);
> +             struct hlist_head *head;
>               struct hlist_node *n;
>               struct htab_elem *l;
>  
> +             raw_spin_lock_bh(&b->lock);
There is a synchronize_rcu() at the beginning of sock_hash_free().
Taking the bucket lock of the map being freed at this point is a bit
suspicious.  What may still access the map after synchronize_rcu()?
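
For reference, the rough shape after this patch, as I read it (a
sketch, not the exact code):

	static void sock_hash_free(struct bpf_map *map)
	{
		synchronize_rcu();	/* BPF-side RCU readers are done */
		rcu_read_lock();
		for (i = 0; i < htab->n_buckets; i++) {
			struct bucket *b = __select_bucket(htab, i);

			raw_spin_lock_bh(&b->lock);	/* new in this patch */
			/* ... unlink and free each element ... */
			raw_spin_unlock_bh(&b->lock);
		}
		rcu_read_unlock();
	}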


> +             head = &b->head;
>               hlist_for_each_entry_safe(l, n, head, hash_node) {
>                       struct sock *sock = l->sk;
>                       struct smap_psock *psock;
> @@ -2137,6 +2195,7 @@ static void sock_hash_free(struct bpf_map *map)
>                       write_unlock_bh(&sock->sk_callback_lock);
>                       kfree(l);
>               }
> +             raw_spin_unlock_bh(&b->lock);
>       }
>       rcu_read_unlock();
>       bpf_map_area_free(htab->buckets);
> @@ -2167,19 +2226,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
>       return l_new;
>  }
>  
> -static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> -                                      u32 hash, void *key, u32 key_size)
> -{
> -     struct htab_elem *l;
> -
> -     hlist_for_each_entry_rcu(l, head, hash_node) {
> -             if (l->hash == hash && !memcmp(&l->key, key, key_size))
> -                     return l;
> -     }
> -
> -     return NULL;
> -}
> -
>  static inline u32 htab_map_hash(const void *key, u32 key_len)
>  {
>       return jhash(key, key_len, 0);
> @@ -2307,8 +2353,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>               psock = smap_psock_sk(l_old->sk);
>  
>               hlist_del_rcu(&l_old->hash_node);
> +             write_lock_bh(&l_old->sk->sk_callback_lock);
>               smap_list_remove(psock, NULL, l_old);
>               smap_release_sock(psock, l_old->sk);
> +             write_unlock_bh(&l_old->sk->sk_callback_lock);
>               free_htab_elem(htab, l_old);
>       }
>       raw_spin_unlock_bh(&b->lock);
> 
