On Sat, Mar 06, 2010 at 05:38:38PM +0800, xiaohui....@intel.com wrote:
> From: Xin Xiaohui <xiaohui....@intel.com>
> 
> The patch let host NIC driver to receive user space skb,
> then the driver has chance to directly DMA to guest user
> space buffers thru single ethX interface.
> 
> Signed-off-by: Xin Xiaohui <xiaohui....@intel.com>
> Signed-off-by: Zhao Yu <yzha...@gmail.com>
> Sigend-off-by: Jeff Dike <jd...@c2.user-mode-linux.org>


I have a feeling I commented on some of the below issues already.
Do you plan to send a version with comments addressed?

> ---
>  include/linux/netdevice.h |   76 ++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/skbuff.h    |   30 +++++++++++++++--
>  net/core/dev.c            |   32 ++++++++++++++++++
>  net/core/skbuff.c         |   79 
> +++++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 205 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 94958c1..97bf12c 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -485,6 +485,17 @@ struct netdev_queue {
>       unsigned long           tx_dropped;
>  } ____cacheline_aligned_in_smp;
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct mpassthru_port        {
> +     int             hdr_len;
> +     int             data_len;
> +     int             npages;
> +     unsigned        flags;
> +     struct socket   *sock;
> +     struct skb_user_page    *(*ctor)(struct mpassthru_port *,
> +                             struct sk_buff *, int);
> +};
> +#endif
>  
>  /*
>   * This structure defines the management hooks for network devices.
> @@ -636,6 +647,10 @@ struct net_device_ops {
>       int                     (*ndo_fcoe_ddp_done)(struct net_device *dev,
>                                                    u16 xid);
>  #endif
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     int                     (*ndo_mp_port_prep)(struct net_device *dev,
> +                                             struct mpassthru_port *port);
> +#endif
>  };
>  
>  /*
> @@ -891,7 +906,8 @@ struct net_device
>       struct macvlan_port     *macvlan_port;
>       /* GARP */
>       struct garp_port        *garp_port;
> -
> +     /* mpassthru */
> +     struct mpassthru_port   *mp_port;
>       /* class/net/name entry */
>       struct device           dev;
>       /* space for optional statistics and wireless sysfs groups */
> @@ -2013,6 +2029,62 @@ static inline u32 dev_ethtool_get_flags(struct 
> net_device *dev)
>               return 0;
>       return dev->ethtool_ops->get_flags(dev);
>  }
> -#endif /* __KERNEL__ */
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +static inline int netdev_mp_port_prep(struct net_device *dev,
> +             struct mpassthru_port *port)
> +{

This function lacks documentation.

> +     int rc;
> +     int npages, data_len;
> +     const struct net_device_ops *ops = dev->netdev_ops;
> +
> +     /* needed by packet split */
> +     if (ops->ndo_mp_port_prep) {
> +             rc = ops->ndo_mp_port_prep(dev, port);
> +             if (rc)
> +                     return rc;
> +     } else {  /* should be temp */
> +             port->hdr_len = 128;
> +             port->data_len = 2048;
> +             port->npages = 1;

where do the numbers come from?

> +     }
> +
> +     if (port->hdr_len <= 0)
> +             goto err;
> +
> +     npages = port->npages;
> +     data_len = port->data_len;
> +     if (npages <= 0 || npages > MAX_SKB_FRAGS ||
> +                     (data_len < PAGE_SIZE * (npages - 1) ||
> +                      data_len > PAGE_SIZE * npages))
> +             goto err;
> +
> +     return 0;
> +err:
> +     dev_warn(&dev->dev, "invalid page constructor parameters\n");
> +
> +     return -EINVAL;
> +}
> +
> +static inline int netdev_mp_port_attach(struct net_device *dev,
> +             struct mpassthru_port *port)
> +{
> +     if (rcu_dereference(dev->mp_port))
> +             return -EBUSY;
> +
> +     rcu_assign_pointer(dev->mp_port, port);
> +
> +     return 0;
> +}
> +
> +static inline void netdev_mp_port_detach(struct net_device *dev)
> +{
> +     if (!rcu_dereference(dev->mp_port))
> +             return;
> +
> +     rcu_assign_pointer(dev->mp_port, NULL);
> +     synchronize_rcu();
> +}

The above looks wrong, rcu_dereference should be called
under rcu read side, rcu_assign_pointer usually should not,
synchronize_rcu definitely should not.

As I suggested already, these functions are better opencoded,
rcu is tricky as is without hiding it in inline helpers.

> +#endif /* CONFIG_VHOST_PASSTHRU */
> +#endif /* __KERNEL__ */
>  #endif       /* _LINUX_NETDEVICE_H */
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index df7b23a..e59fa57 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -209,6 +209,13 @@ struct skb_shared_info {
>       void *          destructor_arg;
>  };
>  
> +struct skb_user_page {
> +     u8              *start;
> +     int             size;
> +     struct skb_frag_struct *frags;
> +     struct skb_shared_info *ushinfo;
> +     void            (*dtor)(struct skb_user_page *);
> +};
>  /* We divide dataref into two halves.  The higher 16 bits hold references
>   * to the payload part of skb->data.  The lower 16 bits hold references to
>   * the entire skb->data.  A clone of a headerless skb holds the length of
> @@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
>  extern void consume_skb(struct sk_buff *skb);
>  extern void         __kfree_skb(struct sk_buff *skb);
>  extern struct sk_buff *__alloc_skb(unsigned int size,
> -                                gfp_t priority, int fclone, int node);
> +                                gfp_t priority, int fclone,
> +                                int node, struct net_device *dev);
>  static inline struct sk_buff *alloc_skb(unsigned int size,
>                                       gfp_t priority)
>  {
> -     return __alloc_skb(size, priority, 0, -1);
> +     return __alloc_skb(size, priority, 0, -1, NULL);
>  }
>  
>  static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
>                                              gfp_t priority)
>  {
> -     return __alloc_skb(size, priority, 1, -1);
> +     return __alloc_skb(size, priority, 1, -1, NULL);
>  }
>  
>  extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
> @@ -1509,6 +1517,22 @@ static inline void netdev_free_page(struct net_device 
> *dev, struct page *page)
>       __free_page(page);
>  }
>  
> +extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
> +                     struct sk_buff *skb, int npages);
> +
> +static inline struct skb_user_page *netdev_alloc_user_page(
> +             struct net_device *dev,
> +             struct sk_buff *skb, unsigned int size)
> +{
> +     struct skb_user_page *user;
> +     int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);

Should round up to full pages?

> +
> +     user = netdev_alloc_user_pages(dev, skb, npages);
> +     if (likely(user))
> +             return user;
> +     return NULL;
> +}
> +
>  /**
>   *   skb_clone_writable - is the header of a clone writable
>   *   @skb: buffer to check
> diff --git a/net/core/dev.c b/net/core/dev.c
> index b8f74cf..ab8b082 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2265,6 +2265,30 @@ void netif_nit_deliver(struct sk_buff *skb)
>       rcu_read_unlock();
>  }
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
> +                                     struct packet_type **pt_prev,
> +                                     int *ret, struct net_device *orig_dev)

please document

pt_prev, orig_dev and ret seem unused?

> +{
> +     struct mpassthru_port *ctor = NULL;

Why do you call the port "ctor"?

> +     struct sock *sk = NULL;
> +
> +     if (skb->dev)
> +             ctor = skb->dev->mp_port;
> +     if (!ctor)
> +             return skb;
> +
> +     sk = ctor->sock->sk;
> +
> +     skb_queue_tail(&sk->sk_receive_queue, skb);
> +
> +     sk->sk_data_ready(sk, skb->len);
> +     return NULL;
> +}
> +#else
> +#define handle_mpassthru(skb, pt_prev, ret, orig_dev)      (skb)
> +#endif
> +
>  /**
>   *   netif_receive_skb - process receive buffer from network
>   *   @skb: buffer to process
> @@ -2342,6 +2366,9 @@ int netif_receive_skb(struct sk_buff *skb)
>               goto out;
>  ncls:
>  #endif
> +     skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
> +     if (!skb)
> +             goto out;
>  
>       skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
>       if (!skb)
> @@ -2455,6 +2482,11 @@ int dev_gro_receive(struct napi_struct *napi, struct 
> sk_buff *skb)
>       if (skb_is_gso(skb) || skb_has_frags(skb))
>               goto normal;
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     if (skb->dev && skb->dev->mp_port)
> +             goto normal;
> +#endif
> +
>       rcu_read_lock();
>       list_for_each_entry_rcu(ptype, head, list) {
>               if (ptype->type != type || ptype->dev || !ptype->gro_receive)
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 80a9616..6510e5b 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -170,13 +170,15 @@ EXPORT_SYMBOL(skb_under_panic);
>   *   %GFP_ATOMIC.
>   */
>  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> -                         int fclone, int node)
> +                         int fclone, int node, struct net_device *dev)
>  {
>       struct kmem_cache *cache;
>       struct skb_shared_info *shinfo;
>       struct sk_buff *skb;
>       u8 *data;
> -
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     struct skb_user_page *user = NULL;
> +#endif
>       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>  
>       /* Get the HEAD */
> @@ -185,8 +187,26 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
> gfp_mask,
>               goto out;
>  
>       size = SKB_DATA_ALIGN(size);
> -     data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
> -                     gfp_mask, node);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     if (!dev || !dev->mp_port) { /* Legacy alloc func */
> +#endif
> +             data = kmalloc_node_track_caller(
> +                             size + sizeof(struct skb_shared_info),
> +                             gfp_mask, node);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     } else { /* Allocation may from page constructor of device */

what does the comment mean?

> +             user = netdev_alloc_user_page(dev, skb, size);
> +             if (!user) {
> +                     data = kmalloc_node_track_caller(
> +                             size + sizeof(struct skb_shared_info),
> +                             gfp_mask, node);
> +                     printk(KERN_INFO "can't alloc user buffer.\n");
> +             } else {
> +                     data = user->start;
> +                     size = SKB_DATA_ALIGN(user->size);
> +             }
> +     }
> +#endif
>       if (!data)
>               goto nodata;
>  
> @@ -208,6 +228,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
> gfp_mask,
>       skb->mac_header = ~0U;
>  #endif
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     if (user)
> +             memcpy(user->ushinfo, skb_shinfo(skb),
> +                             sizeof(struct skb_shared_info));
> +#endif
>       /* make sure we initialize shinfo sequentially */
>       shinfo = skb_shinfo(skb);
>       atomic_set(&shinfo->dataref, 1);
> @@ -231,6 +256,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
> gfp_mask,
>  
>               child->fclone = SKB_FCLONE_UNAVAILABLE;
>       }
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     shinfo->destructor_arg = user;
> +#endif
> +
>  out:
>       return skb;
>  nodata:
> @@ -259,7 +288,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
>       int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
>       struct sk_buff *skb;
>  
> -     skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
> +     skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
>       if (likely(skb)) {
>               skb_reserve(skb, NET_SKB_PAD);
>               skb->dev = dev;
> @@ -278,6 +307,29 @@ struct page *__netdev_alloc_page(struct net_device *dev, 
> gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL(__netdev_alloc_page);
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
> +                     struct sk_buff *skb, int npages)
> +{
> +     struct mpassthru_port *ctor;
> +     struct skb_user_page *user = NULL;
> +
> +     rcu_read_lock();
> +     ctor = rcu_dereference(dev->mp_port);
> +     if (!ctor)
> +             goto out;
> +
> +     BUG_ON(npages > ctor->npages);
> +
> +     user = ctor->ctor(ctor, skb, npages);

With the assumption that "ctor" pins userspace pages,
can't it sleep? If yes you can't call it under rcu read side
critical section.

> +out:
> +     rcu_read_unlock();
> +
> +     return user;
> +}
> +EXPORT_SYMBOL(netdev_alloc_user_pages);
> +#endif
> +
>  void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
>               int size)
>  {
> @@ -338,6 +390,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)
>  
>  static void skb_release_data(struct sk_buff *skb)
>  {
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
> +#endif
> +
>       if (!skb->cloned ||
>           !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
>                              &skb_shinfo(skb)->dataref)) {
> @@ -349,7 +405,10 @@ static void skb_release_data(struct sk_buff *skb)
>  
>               if (skb_has_frags(skb))
>                       skb_drop_fraglist(skb);
> -
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +             if (skb->dev && skb->dev->mp_port && user && user->dtor)
> +                     user->dtor(user);
> +#endif
>               kfree(skb->head);
>       }
>  }
> @@ -503,8 +562,14 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
>       if (skb_shared(skb) || skb_cloned(skb))
>               return 0;
>  
> -     skb_release_head_state(skb);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +     if (skb->dev && skb->dev->mp_port)
> +             return 0;
> +#endif
> +
>       shinfo = skb_shinfo(skb);
> +
> +     skb_release_head_state(skb);
>       atomic_set(&shinfo->dataref, 1);
>       shinfo->nr_frags = 0;
>       shinfo->gso_size = 0;
> -- 
> 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to