From: Xin, Xiaohui<xiaohui....@intel.com>

Describe where the external buffer comes from and how it is destroyed.

Signed-off-by: Xin Xiaohui <xiaohui....@intel.com>
Signed-off-by: Zhao Yu <yzhao81...@gmail.com>
Reviewed-by: Jeff Dike <jd...@linux.intel.com>
---
 drivers/vhost/mpassthru.c |  253 ++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 251 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 25e2f3e..de07f1e 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -161,6 +161,39 @@ static int mp_dev_change_flags(struct net_device *dev, 
unsigned flags)
        return ret;
 }
 
+/* The main function to allocate external buffers: pops a pre-posted
+ * page_info off the constructor's read queue and exposes its pinned
+ * pages to the driver as skb frags.  Returns NULL when no buffer has
+ * been queued by the backend.
+ */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+               struct sk_buff *skb, int npages)
+{
+       int i;
+       unsigned long flags;
+       struct page_ctor *ctor;
+       struct page_info *info = NULL;
+
+       ctor = container_of(port, struct page_ctor, port);
+
+       /* Dequeue one posted buffer; readq is refilled elsewhere. */
+       spin_lock_irqsave(&ctor->read_lock, flags);
+       if (!list_empty(&ctor->readq)) {
+               info = list_first_entry(&ctor->readq, struct page_info, list);
+               list_del(&info->list);
+       }
+       spin_unlock_irqrestore(&ctor->read_lock, flags);
+       if (!info)
+               return NULL;
+
+       for (i = 0; i < info->pnum; i++) {
+               /* Take an extra reference per page: the driver now
+                * holds the page as a frag in addition to the pin. */
+               get_page(info->pages[i]);
+               info->frag[i].page = info->pages[i];
+               /* Only the first frag starts at a non-zero offset. */
+               info->frag[i].page_offset = i ? 0 : info->offset;
+               info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+                       port->data_len;
+       }
+       info->skb = skb;
+       info->ext_page.frags = info->frag;
+       info->ext_page.ushinfo = &info->ushinfo;
+       return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
        int rc;
@@ -186,7 +219,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
        dev_hold(dev);
        ctor->dev = dev;
-       ctor->port.ctor = NULL;
+       ctor->port.ctor = page_ctor;
        ctor->port.sock = &mp->socket;
        ctor->lock_pages = 0;
        rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -252,11 +285,66 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int 
resource,
        return 0;
 }
 
+/* Called whenever queued work drains: once the device is down and both
+ * the read and write queues are empty, nothing is left in flight.
+ * NOTE(review): this currently only logs -- no resource is actually
+ * released here; confirm whether the real teardown is deferred to a
+ * later patch in the series.
+ */
+static void relinquish_resource(struct page_ctor *ctor)
+{
+       if (!(ctor->dev->flags & IFF_UP) &&
+                       !(ctor->wq_len + ctor->rq_len))
+               printk(KERN_INFO "relinquish_resource\n");
+}
+
+/* Final iocb destructor: for a read (RX) request, unpin the user pages
+ * and free the skb; for a write (TX) request, only drop the queue
+ * count.  Frees the page_info itself, so the ctor pointer is saved in
+ * a local first (reading info->ctor after kmem_cache_free() would be a
+ * use-after-free).
+ */
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+       struct page_info *info = (struct page_info *)(iocb->private);
+       struct page_ctor *ctor = info->ctor;
+       int i;
+
+       if (info->flags == INFO_READ) {
+               for (i = 0; i < info->pnum; i++) {
+                       if (info->pages[i]) {
+                               /* Pages may have been written to by
+                                * hardware; mark dirty before unpin. */
+                               set_page_dirty_lock(info->pages[i]);
+                               put_page(info->pages[i]);
+                       }
+               }
+               /* Detach the destructor before freeing the skb. */
+               info->skb->destructor = NULL;
+               kfree_skb(info->skb);
+               ctor->rq_len--;
+       } else
+               ctor->wq_len--;
+       /* Decrement the number of locked pages */
+       ctor->lock_pages -= info->pnum;
+       kmem_cache_free(ext_page_info_cache, info);
+       /* Use the saved ctor: 'info' has just been returned to the
+        * slab cache and must not be dereferenced any more. */
+       relinquish_resource(ctor);
+}
+
+/* Prepare the request's iocb for completion hand-off: reset the
+ * bookkeeping fields, record the descriptor position and byte count,
+ * and install mp_ki_dtor as the final destructor.
+ * Returns NULL when the page_info carries no iocb.
+ */
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+       struct kiocb *iocb = NULL;
+
+       iocb = info->iocb;
+       if (!iocb)
+               return iocb;
+       iocb->ki_flags = 0;
+       iocb->ki_users = 1;
+       iocb->ki_key = 0;
+       iocb->ki_ctx = NULL;
+       iocb->ki_cancel = NULL;
+       iocb->ki_retry = NULL;
+       iocb->ki_iovec = NULL;
+       iocb->ki_eventfd = NULL;
+       iocb->ki_pos = info->desc_pos;
+       iocb->ki_nbytes = size;
+       /* NOTE(review): this invokes whatever ki_dtor the backend left
+        * installed *before* replacing it below -- confirm that every
+        * caller guarantees a valid (non-NULL) ki_dtor at this point,
+        * otherwise this is a NULL/stale function-pointer call. */
+       iocb->ki_dtor(iocb);
+       iocb->private = (void *)info;
+       iocb->ki_dtor = mp_ki_dtor;
+
+       return iocb;
+}
+
 static int page_ctor_detach(struct mp_struct *mp)
 {
        struct page_ctor *ctor;
        struct page_info *info;
-       struct kiocb *iocb = NULL;
        int i;
 
        /* locked by mp_mutex */
@@ -268,11 +356,17 @@ static int page_ctor_detach(struct mp_struct *mp)
                for (i = 0; i < info->pnum; i++)
                        if (info->pages[i])
                                put_page(info->pages[i]);
+               create_iocb(info, 0);
+               ctor->rq_len--;
                kmem_cache_free(ext_page_info_cache, info);
        }
+
+       relinquish_resource(ctor);
+
        set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
                           ctor->o_rlim.rlim_cur,
                           ctor->o_rlim.rlim_max);
+
        netdev_mp_port_detach(ctor->dev);
        dev_put(ctor->dev);
 
@@ -320,6 +414,161 @@ static void mp_put(struct mp_file *mfile)
                mp_detach(mfile->mp);
 }
 
+/* The callback to destruct the external buffers or skb.
+ * An RX buffer that carried no data (total == 0) is recycled onto the
+ * read queue; a completed TX buffer is turned into a completion iocb
+ * and the socket writer is woken.  (Removed the unused local 'skb',
+ * which was assigned but never read.)
+ */
+static void page_dtor(struct skb_external_page *ext_page)
+{
+       struct page_info *info;
+       struct page_ctor *ctor;
+       struct sock *sk;
+       unsigned long flags;
+
+       if (!ext_page)
+               return;
+       info = container_of(ext_page, struct page_info, ext_page);
+       if (!info)
+               return;
+       ctor = info->ctor;
+
+       /* NOTE(review): presumably cleared so kfree_skb() will not free
+        * the externally-owned head -- confirm against the skb core. */
+       if ((info->flags == INFO_READ) && info->skb)
+               info->skb->head = NULL;
+
+       /* If the info->total is 0, make it to be reused */
+       if (!info->total) {
+               spin_lock_irqsave(&ctor->read_lock, flags);
+               list_add(&info->list, &ctor->readq);
+               spin_unlock_irqrestore(&ctor->read_lock, flags);
+               return;
+       }
+
+       if (info->flags == INFO_READ)
+               return;
+
+       /* For transmit, we should wait for the DMA finish by hardware.
+        * Queue the notifier to wake up the backend driver
+        */
+       create_iocb(info, info->total);
+
+       sk = ctor->port.sock->sk;
+       sk->sk_write_space(sk);
+}
+
+/* For small external buffers transmit, we don't need to call
+ * get_user_pages(): only the bookkeeping page_info is allocated.
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+               struct kiocb *iocb, int total)
+{
+       struct page_info *info =
+               kmem_cache_zalloc(ext_page_info_cache, GFP_KERNEL);
+
+       if (!info)
+               return NULL;
+       info->total = total;
+       info->ext_page.dtor = page_dtor;
+       info->ctor = ctor;
+       info->flags = INFO_WRITE;
+       info->iocb = iocb;
+       return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the external buffer address.
+ * Pins the iovec's pages with get_user_pages_fast(), fills @frags
+ * with per-page offset/size pairs and charges the pages against the
+ * MEMLOCK accounting.  Returns NULL on any failure with all pinned
+ * pages released.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+               struct kiocb *iocb, struct iovec *iov,
+               int count, struct frag *frags,
+               int npages, int total)
+{
+       int rc;
+       int i, j, n = 0;
+       int len;
+       unsigned long base, lock_limit;
+       struct page_info *info = NULL;
+
+       lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+       lock_limit >>= PAGE_SHIFT;
+
+       if (ctor->lock_pages + count > lock_limit) {
+               /* fixed: message was missing its terminating newline */
+               printk(KERN_INFO "exceed the locked memory rlimit.\n");
+               return NULL;
+       }
+
+       info = kmem_cache_zalloc(ext_page_info_cache, GFP_KERNEL);
+
+       if (!info)
+               return NULL;
+
+       for (i = j = 0; i < count; i++) {
+               base = (unsigned long)iov[i].iov_base;
+               len = iov[i].iov_len;
+
+               if (!len)
+                       continue;
+               /* Pages spanned by the (possibly unaligned) range */
+               n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+               /* Writable pin for RX (npages != 0), read-only for TX */
+               rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+                               &info->pages[j]);
+               if (rc != n)
+                       goto failed;
+
+               while (n--) {
+                       frags[j].offset = base & ~PAGE_MASK;
+                       frags[j].size = min_t(int, len,
+                                       PAGE_SIZE - frags[j].offset);
+                       len -= frags[j].size;
+                       base += frags[j].size;
+                       j++;
+               }
+       }
+
+#ifdef CONFIG_HIGHMEM
+       /* fixed: 'dev' was not declared in this scope (build break with
+        * CONFIG_HIGHMEM); the device is reached through the ctor. */
+       if (npages && !(ctor->dev->features & NETIF_F_HIGHDMA)) {
+               for (i = 0; i < j; i++) {
+                       if (PageHighMem(info->pages[i]))
+                               goto failed;
+               }
+       }
+#endif
+
+       info->total = total;
+       info->ext_page.dtor = page_dtor;
+       info->ctor = ctor;
+       info->pnum = j;
+       info->iocb = iocb;
+       if (!npages)
+               info->flags = INFO_WRITE;
+       /* NOTE(review): relies on zalloc leaving flags == INFO_READ for
+        * the RX case -- verify INFO_READ is defined as 0. */
+       if (info->flags == INFO_READ) {
+               info->ext_page.start = (u8 *)(((unsigned long)
+                               (pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+                               frags[0].offset));
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+               info->ext_page.size = SKB_DATA_ALIGN(
+                               iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD);
+#else
+               info->ext_page.size = SKB_DATA_ALIGN(
+                               iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD) -
+                       NET_IP_ALIGN - NET_SKB_PAD;
+#endif
+       }
+       /* increment the number of locked pages */
+       ctor->lock_pages += j;
+       return info;
+
+failed:
+       /* Drop every page pinned so far before bailing out */
+       for (i = 0; i < j; i++)
+               put_page(info->pages[i]);
+
+       kmem_cache_free(ext_page_info_cache, info);
+
+       return NULL;
+}
+
 /* Ops structure to mimic raw sockets with mp device */
 static const struct proto_ops mp_socket_ops = {
 };
-- 
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to