This patch lets the host NIC driver receive user-space skbs, so the
driver then has the chance to DMA directly into guest user-space
buffers through a single ethX interface.

Signed-off-by: Xin Xiaohui <xiaohui....@intel.com>
Signed-off-by: Zhao Yu <yzha...@gmail.com>
Signed-off-by: Jeff Dike <jd...@c2.user-mode-linux.org>
---
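
Notes for review (not part of the changelog): the sketch below shows one
way a NIC driver might implement the new ndo_page_ctor_prep hook. The
driver name "foo" and the sizes are illustrative assumptions, not code
from this patch; a real driver would report whatever header/data split
its packet-split receive path actually uses.

static int foo_page_ctor_prep(struct net_device *dev,
                              struct netdev_page_ctor *ctor)
{
        /* Report the packet-split layout. The core validates that
         * hdr_len > 0, 0 < npages <= MAX_SKB_FRAGS, and that data_len
         * lies between (npages - 1) and npages pages.
         */
        ctor->hdr_len = 128;        /* small kernel header buffer */
        ctor->data_len = PAGE_SIZE; /* one page of payload ...    */
        ctor->npages = 1;           /* ... in a single frag       */
        return 0;
}

The driver would then set .ndo_page_ctor_prep = foo_page_ctor_prep in
its net_device_ops.
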
 include/linux/netdevice.h |   72 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/skbuff.h    |   32 ++++++++++++++++++--
 net/core/dev.c            |   27 +++++++++++++++++
 net/core/skbuff.c         |   62 +++++++++++++++++++++++++++++++++++----
 4 files changed, 184 insertions(+), 9 deletions(-)
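
For completeness, the intended attach/detach sequence from a backend's
point of view, again only as a sketch: foo_user_page_ctor stands in for
a hypothetical callback that would pin and map guest pages into a
struct skb_user_page, and error handling is reduced to the essentials.

static struct skb_user_page *foo_user_page_ctor(struct netdev_page_ctor *ctor,
                                                struct sk_buff *skb,
                                                int npages);

static struct netdev_page_ctor foo_ctor = {
        .ctor = foo_user_page_ctor,
};

static int foo_backend_bind(struct net_device *dev, struct socket *sock)
{
        int err;

        foo_ctor.sock = sock;

        /* Let the driver (or the temporary defaults) fill in the
         * layout, then validate it.
         */
        err = netdev_page_ctor_prep(dev, &foo_ctor);
        if (err)
                return err;

        /* Fails with -EBUSY if the device is up or already claimed. */
        return netdev_page_ctor_attach(dev, &foo_ctor);
}

static void foo_backend_unbind(struct net_device *dev)
{
        /* Clears dev->page_ctor, then synchronize_rcu() drains readers. */
        netdev_page_ctor_detach(dev);
}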

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..0de8688 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -486,6 +486,16 @@ struct netdev_queue {
 } ____cacheline_aligned_in_smp;
 
 
+struct netdev_page_ctor {
+       int             hdr_len;
+       int             data_len;
+       int             npages;
+       unsigned        flags;
+       struct socket   *sock;
+       struct skb_user_page    *(*ctor)(struct netdev_page_ctor *,
+                               struct sk_buff *, int);
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -636,6 +646,8 @@ struct net_device_ops {
        int                     (*ndo_fcoe_ddp_done)(struct net_device *dev,
                                                     u16 xid);
 #endif
+       int                     (*ndo_page_ctor_prep)(struct net_device *dev,
+                                               struct netdev_page_ctor *ctor);
 };
 
 /*
@@ -916,6 +928,7 @@ struct net_device
        /* max exchange id for FCoE LRO by ddp */
        unsigned int            fcoe_ddp_xid;
 #endif
+       struct netdev_page_ctor         *page_ctor;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -2013,6 +2026,65 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
                return 0;
        return dev->ethtool_ops->get_flags(dev);
 }
+
+static inline int netdev_page_ctor_prep(struct net_device *dev,
+               struct netdev_page_ctor *ctor)
+{
+       int rc;
+       int npages, data_len;
+       const struct net_device_ops *ops = dev->netdev_ops;
+
+       /* needed by packet split */
+       if (ops->ndo_page_ctor_prep) {
+               rc = ops->ndo_page_ctor_prep(dev, ctor);
+               if (rc)
+                       return rc;
+       } else {  /* temporary defaults until drivers implement the hook */
+               ctor->hdr_len = 128;
+               ctor->data_len = 2048;
+               ctor->npages = 1;
+       }
+
+       if (ctor->hdr_len <= 0)
+               goto err;
+
+       npages = ctor->npages;
+       data_len = ctor->data_len;
+       if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+                       (data_len < PAGE_SIZE * (npages - 1) ||
+                        data_len > PAGE_SIZE * npages))
+               goto err;
+
+       return 0;
+err:
+       dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+       return -EINVAL;
+}
+
+static inline int netdev_page_ctor_attach(struct net_device *dev,
+               struct netdev_page_ctor *ctor)
+{
+       if (dev->flags & IFF_UP)
+               return -EBUSY;
+
+       if (rcu_dereference(dev->page_ctor))
+               return -EBUSY;
+
+       rcu_assign_pointer(dev->page_ctor, ctor);
+
+       return 0;
+}
+
+static inline void netdev_page_ctor_detach(struct net_device *dev)
+{
+       if (!rcu_dereference(dev->page_ctor))
+               return;
+
+       rcu_assign_pointer(dev->page_ctor, NULL);
+       synchronize_rcu();
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_NETDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..c77837e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -209,6 +209,13 @@ struct skb_shared_info {
        void *          destructor_arg;
 };
 
+struct skb_user_page {
+       u8              *start;
+       int             size;
+       struct skb_frag_struct *frags;
+       struct skb_shared_info *ushinfo;
+       void            (*dtor)(struct skb_user_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void           __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-                                  gfp_t priority, int fclone, int node);
+                                  gfp_t priority, int fclone,
+                                  int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
                                        gfp_t priority)
 {
-       return __alloc_skb(size, priority, 0, -1);
+       return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                                               gfp_t priority)
 {
-       return __alloc_skb(size, priority, 1, -1);
+       return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1509,6 +1517,24 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
        __free_page(page);
 }
 
+extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+                       struct sk_buff *skb, int npages);
+
+extern int netdev_use_ps_feature(struct net_device *dev);
+
+static inline struct skb_user_page *netdev_alloc_user_page(
+               struct net_device *dev,
+               struct sk_buff *skb, unsigned int size)
+{
+       struct skb_user_page *user;
+       int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);
+
+       /* NULL when no page constructor is attached to the device */
+       user = netdev_alloc_user_pages(dev, skb, npages);
+
+       return user;
+}
+
 /**
  *     skb_clone_writable - is the header of a clone writable
  *     @skb: buffer to check
diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cf..9d2c2ba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2265,6 +2265,27 @@ void netif_nit_deliver(struct sk_buff *skb)
        rcu_read_unlock();
 }
 
+static inline struct sk_buff *handle_user_space_buf(struct sk_buff *skb,
+                                       struct packet_type **pt_prev,
+                                       int *ret, struct net_device *orig_dev)
+{
+       struct netdev_page_ctor *ctor = NULL;
+       struct sock *sk = NULL;
+
+       if (skb->dev)
+               ctor = skb->dev->page_ctor;
+       if (!ctor)
+               return skb;
+
+       sk = ctor->sock->sk;
+
+       skb_queue_tail(&sk->sk_receive_queue, skb);
+
+       sk->sk_data_ready(sk, skb->len);
+       return NULL;
+}
+
+
 /**
  *     netif_receive_skb - process receive buffer from network
  *     @skb: buffer to process
@@ -2342,6 +2363,9 @@ int netif_receive_skb(struct sk_buff *skb)
                goto out;
 ncls:
 #endif
+       skb = handle_user_space_buf(skb, &pt_prev, &ret, orig_dev);
+       if (!skb)
+               goto out;
 
        skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
@@ -2455,6 +2479,9 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
        if (skb_is_gso(skb) || skb_has_frags(skb))
                goto normal;
 
+       if (skb->dev && skb->dev->page_ctor)
+               goto normal;
+
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 80a9616..40461d5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -170,12 +170,13 @@ EXPORT_SYMBOL(skb_under_panic);
  *     %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-                           int fclone, int node)
+                           int fclone, int node, struct net_device *dev)
 {
        struct kmem_cache *cache;
        struct skb_shared_info *shinfo;
        struct sk_buff *skb;
        u8 *data;
+       struct skb_user_page *user = NULL;
 
        cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
@@ -185,8 +186,22 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                goto out;
 
        size = SKB_DATA_ALIGN(size);
-       data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-                       gfp_mask, node);
+
+       if (!dev || !dev->page_ctor) { /* legacy allocation path */
+               data = kmalloc_node_track_caller(
+                               size + sizeof(struct skb_shared_info),
+                               gfp_mask, node);
+       } else { /* allocation may come from the device's page constructor */
+               user = netdev_alloc_user_page(dev, skb, size);
+               if (!user) {
+                       data = kmalloc_node_track_caller(
+                               size + sizeof(struct skb_shared_info),
+                               gfp_mask, node);
+               } else {
+                       data = user->start;
+                       size = SKB_DATA_ALIGN(user->size);
+               }
+       }
        if (!data)
                goto nodata;
 
@@ -208,6 +223,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
        skb->mac_header = ~0U;
 #endif
 
+       if (user)
+               memcpy(user->ushinfo, skb_shinfo(skb),
+                               sizeof(struct skb_shared_info));
+
        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        atomic_set(&shinfo->dataref, 1);
@@ -231,6 +250,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
                child->fclone = SKB_FCLONE_UNAVAILABLE;
        }
+
+       shinfo->destructor_arg = user;
+
 out:
        return skb;
 nodata:
@@ -259,7 +281,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
        int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
        struct sk_buff *skb;
 
-       skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+       skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
        if (likely(skb)) {
                skb_reserve(skb, NET_SKB_PAD);
                skb->dev = dev;
@@ -278,6 +300,27 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+                       struct sk_buff *skb, int npages)
+{
+       struct netdev_page_ctor *ctor;
+       struct skb_user_page *user = NULL;
+
+       rcu_read_lock();
+       ctor = rcu_dereference(dev->page_ctor);
+       if (!ctor)
+               goto out;
+
+       BUG_ON(npages > ctor->npages);
+
+       user = ctor->ctor(ctor, skb, npages);
+out:
+       rcu_read_unlock();
+
+       return user;
+}
+EXPORT_SYMBOL(netdev_alloc_user_pages);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                int size)
 {
@@ -338,6 +381,8 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+       struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
+
        if (!skb->cloned ||
            !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
                               &skb_shinfo(skb)->dataref)) {
@@ -349,7 +394,8 @@ static void skb_release_data(struct sk_buff *skb)
 
                if (skb_has_frags(skb))
                        skb_drop_fraglist(skb);
-
+               if (skb->dev && skb->dev->page_ctor && user && user->dtor)
+                       user->dtor(user);
                kfree(skb->head);
        }
 }
@@ -503,8 +549,12 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
        if (skb_shared(skb) || skb_cloned(skb))
                return 0;
 
-       skb_release_head_state(skb);
+       if (skb->dev && skb->dev->page_ctor)
+               return 0;
+
        shinfo = skb_shinfo(skb);
+
+       skb_release_head_state(skb);
        atomic_set(&shinfo->dataref, 1);
        shinfo->nr_frags = 0;
        shinfo->gso_size = 0;
-- 
1.5.4.4
