Quoting Libor Michalek <[EMAIL PROTECTED]>:
> Subject: Re: [PATCH] sdp: replace mlock with get_user_pages
> 
> On Thu, May 05, 2005 at 02:01:58PM +0300, Michael S. Tsirkin wrote:
> > Hello, Libor!
> > The following patch replaces the mlock hack with a call
> > to get_user_pages. Since the application could have forked
> > while an iocb is outstanding, when an iocb is done
> > I call get_user_pages a second time and copy the data if
> > the physical address has changed.
> > 
> > Thus, changing ulimit is no longer required to get aio
> > working; processes are also allowed to fork and to call
> > mlock/munlock on the buffer.
> > 
> > Tested with the ttcp.aio benchmark; works fine for me.
> 
>   In the latest kernel, what happens on fork to a page that's
> held with a get_user_pages reference? That is the only case
> left. Since there is an iocb flags parameter, it would be
> preferable to use a bit there instead of is_receive. Also,
> please leave spaces around '='. Finally, it would be nice
> if we could get rid of having to schedule a task to unlock
> the iocb in the completion path; I've noticed this having a
> negative effect on system CPU utilization on older kernels.
> 
> -Libor

I don't see what would prevent the page
from being copied on a COW event. Do you? Certainly the patch in 2.6.7
that fixed the swap-out problem does not affect COW.
With regard to avoiding task scheduling: since the current code already
needs this, we are not breaking anything.
I put in comments so that we remember to research this at some point.
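
To make the completion-time check concrete, here is a minimal sketch of
the idea for a single page (hypothetical helper name, whole-page copy;
the real implementation is sdp_iocb_unlock() in the patch below, which
copies only the buffer's bytes):

static void sdp_recheck_on_complete(struct task_struct *tsk,
                                    struct mm_struct *mm,
                                    unsigned long uaddr,
                                    struct page *pinned)
{
        struct page *now;

        down_read(&mm->mmap_sem);
        /* re-resolve the user page; write access breaks any COW sharing */
        if (get_user_pages(tsk, mm, uaddr & PAGE_MASK, 1, 1, 0,
                           &now, NULL) == 1) {
                if (now != pinned) {
                        /* the VA was remapped (e.g. COW after fork):
                         * move the received data into the current page */
                        copy_highpage(now, pinned);
                        set_page_dirty_lock(now);
                }
                put_page(now);
        }
        up_read(&mm->mmap_sem);
        put_page(pinned);       /* drop the reference taken at submit */
}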

In the patch below the style issues are addressed: the whitespace is
fixed, and is_receive is changed to a flag bit.

OK to check in?

Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>

Index: ulp/sdp/sdp_send.c
===================================================================
--- ulp/sdp/sdp_send.c  (revision 2292)
+++ ulp/sdp/sdp_send.c  (working copy)
@@ -2197,7 +2197,7 @@ skip: /* entry point for IOCB based tran
                iocb->req  = req;
                iocb->key  = req->ki_key;
                iocb->addr = (unsigned long)msg->msg_iov->iov_base - copied;
-      
+
                req->ki_cancel = sdp_inet_write_cancel;
 
                result = sdp_iocb_lock(iocb);
Index: ulp/sdp/sdp_recv.c
===================================================================
--- ulp/sdp/sdp_recv.c  (revision 2292)
+++ ulp/sdp/sdp_recv.c  (working copy)
@@ -1449,6 +1449,8 @@ int sdp_inet_recv(struct kiocb  *req, st
                        iocb->addr = ((unsigned long)msg->msg_iov->iov_base -
                                      copied);
 
+                       iocb->flags |= SDP_IOCB_F_RECV;
+
                        req->ki_cancel = sdp_inet_read_cancel;
 
                        result = sdp_iocb_lock(iocb);
Index: ulp/sdp/sdp_iocb.c
===================================================================
--- ulp/sdp/sdp_iocb.c  (revision 2292)
+++ ulp/sdp/sdp_iocb.c  (working copy)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,389 +32,196 @@
  *
  * $Id$
  */
-
+#include <linux/pagemap.h>
 #include "sdp_main.h"
 
 static kmem_cache_t *sdp_iocb_cache = NULL;
 
-/*
- * memory locking functions
- */
-#include <linux/utsname.h>
-
-typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
-static do_mlock_ptr_t mlock_ptr = NULL;
-
-/*
- * do_iocb_unlock - unlock the memory for an IOCB
- */
-static int do_iocb_unlock(struct sdpc_iocb *iocb)
-{
-       struct vm_area_struct *vma;
-
-       vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
-       if (!vma)
-               sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
-                        iocb->addr, iocb->size);
-
-       while (vma) {
-               sdp_dbg_data(NULL, 
-                            "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
-                            iocb->addr, vma, vma->vm_start, vma->vm_end,
-                            vma->vm_flags, (long)vma->vm_private_data);
-               
-               spin_lock(&iocb->mm->page_table_lock);
-               /*
-                * if there are no more references to the vma
-                */
-               vma->vm_private_data--;
- 
-               if (!vma->vm_private_data) {
-                       /*
-                        * modify VM flags.
-                        */
-                       vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
-                       /*
-                        * adjust locked page count
-                        */
-                       vma->vm_mm->locked_vm -= ((vma->vm_end -
-                                                  vma->vm_start) >> 
-                                                 PAGE_SHIFT);
-               }
-
-               spin_unlock(&iocb->mm->page_table_lock);
-               /*
-                * continue if the buffer continues onto the next vma
-                */
-               if ((iocb->addr + iocb->size) > vma->vm_end)
-                       vma = vma->vm_next;
-               else
-                       vma = NULL;
-       }
-
-       return 0;
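+/*
+ * Copy the relevant bytes of one page under atomic kmaps.  Interrupts
+ * are disabled and the KM_IRQ0/KM_IRQ1 slots are used, so this is safe
+ * to call from IRQ (completion) context.
+ */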
+static void sdp_copy_one_page(struct page *from, struct page *to,
+                             unsigned long iocb_addr, size_t iocb_size,
+                             unsigned long uaddr)
+{
+       unsigned long offset = uaddr % PAGE_SIZE;
+       size_t size_left = iocb_addr + iocb_size - uaddr;
+       /* copy no further than the end of the buffer or of this page */
+       size_t size = min_t(size_t, size_left, PAGE_SIZE - offset);
+       unsigned long flags;
+
+       void *fptr;
+       void *tptr;
+
+       local_irq_save(flags);
+       fptr = kmap_atomic(from, KM_IRQ0);
+       tptr = kmap_atomic(to, KM_IRQ1);
+
+       memcpy(tptr + offset, fptr + offset, size);
+
+       kunmap_atomic(tptr, KM_IRQ1);
+       kunmap_atomic(fptr, KM_IRQ0);
+       local_irq_restore(flags);
+       set_page_dirty_lock(to);
 }
 
 /*
  * sdp_iocb_unlock - unlock the memory for an IOCB
+ * Copy the data back if the physical pages have moved since locking.
+ * TODO: is this needed?
  */
 int sdp_iocb_unlock(struct sdpc_iocb *iocb)
 {
        int result;
+       struct page **pages = NULL;
+       unsigned long uaddr;
+       int i;
 
-       /*
-        * check if IOCB is locked.
-        */
        if (!(iocb->flags & SDP_IOCB_F_LOCKED))
                return 0;
-       /*
-        * spin lock since this could be from interrupt context.
-        */
-       down_write(&iocb->mm->mmap_sem);
-       
-       result = do_iocb_unlock(iocb);
 
-       up_write(&iocb->mm->mmap_sem);
+       /* For a send (read from user memory), just release the pages */
+       if (!(iocb->flags & SDP_IOCB_F_RECV)) {
+               for (i = 0; i < iocb->page_count; ++i)
+                       put_page(iocb->page_array[i]);
+               goto done;
+       }
 
-       kfree(iocb->page_array);
-       kfree(iocb->addr_array);
+       /* For a receive (write to user memory), we must check that the
+        * virtual pages did not get remapped since locking */
 
-       iocb->page_array = NULL;
-       iocb->addr_array = NULL;
-       iocb->mm = NULL;
-       /*
-        * mark IOCB unlocked.
-        */
-       iocb->flags &= ~SDP_IOCB_F_LOCKED;
+       /* As an optimisation (to avoid scanning the vma tree each time),
+        * try to get all pages in one go. */
+       /* TODO: use cache for allocations? Allocate by chunks? */
 
-       return result;
-}
+       pages = kmalloc(sizeof(struct page *) * iocb->page_count,
+                       GFP_KERNEL);
 
-/*
- * sdp_iocb_page_save - save page information for an IOCB
- */
-static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
-{
-       unsigned int counter;
-       unsigned long addr;
-       size_t size;
-       int result = -ENOMEM;
-       struct page *page;
-       unsigned long pfn;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *ptep;
-       pte_t  pte;
+       down_read(&iocb->mm->mmap_sem);
 
-       if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
-               return -EINVAL;
-       /*
-        * create array to hold page value which are later needed to register
-        * the buffer with the HCA
-        */
-       iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
-                                  GFP_KERNEL);
-       if (!iocb->addr_array)
-               goto err_addr;
+       if (pages) {
+               result = get_user_pages(iocb->tsk, iocb->mm, iocb->addr,
+                                       iocb->page_count, 1, 0, pages, NULL);
 
-       iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count), 
-                                  GFP_KERNEL);
-       if (!iocb->page_array)
-               goto err_page;
-       /*
-        * iocb->addr - buffer start address
-        * iocb->size - buffer length
-        * addr       - page aligned
-        * size       - page multiple
-        */
-       addr = iocb->addr & PAGE_MASK;
-       size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
+               if (result != iocb->page_count) {
+                       /* partial pin: drop what we did get and fall
+                        * back to per-page lookups below */
+                       for (i = 0; i < result; ++i)
+                               put_page(pages[i]);
+                       kfree(pages);
+                       pages = NULL;
+               }
+       }
 
-       iocb->page_offset = iocb->addr - addr;
-       /*
-        * Find pages used within the buffer which will then be registered
-        * for RDMA
-        */
-       spin_lock(&iocb->mm->page_table_lock);
+       for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
+            ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE) {
+               struct page *page;
+
+               set_page_dirty_lock(iocb->page_array[i]);
+
+               if (pages)
+                       page = pages[i];
+               else {
+                       result = get_user_pages(iocb->tsk, iocb->mm,
+                                               uaddr & PAGE_MASK,
+                                               1, 1, 0, &page, NULL);
+                       if (result != 1)
+                               page = NULL;
+               }
 
-       for (counter = 0;
-            size > 0;
-            counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
-               pgd = pgd_offset_gate(iocb->mm, addr);
-               if (!pgd || pgd_none(*pgd))
-                       break;
-
-               pud = pud_offset(pgd, addr);
-               if (!pud || pud_none(*pud))
-                       break;
-               
-               pmd = pmd_offset(pud, addr);
-               if (!pmd || pmd_none(*pmd))
-                       break;
-
-               ptep = pte_offset_map(pmd, addr);
-               if (!ptep) 
-                       break;
-
-               pte = *ptep;
-               pte_unmap(ptep);
-
-               if (!pte_present(pte))
-                       break;
-
-               pfn = pte_pfn(pte);
-               if (!pfn_valid(pfn))
-                       break;
-        
-               page = pfn_to_page(pfn);
+               if (page && iocb->page_array[i] != page)
+                       sdp_copy_one_page(iocb->page_array[i], page,
+                                         iocb->addr, iocb->size, uaddr);
 
-               iocb->page_array[counter] = page;
-               iocb->addr_array[counter] = page_to_phys(page);
+               if (page)
+                       put_page(page);
+               put_page(iocb->page_array[i]);
        }
 
-       spin_unlock(&iocb->mm->page_table_lock);
-       
-       if (size > 0) {
-               result = -EFAULT;
-               goto err_find;
-       }
+       up_read(&iocb->mm->mmap_sem);
 
-       return 0;
-err_find:
-       
-       kfree(iocb->page_array);
-       iocb->page_array = NULL;
-err_page:
+       kfree(pages);
 
+done:
+       kfree(iocb->page_array);
        kfree(iocb->addr_array);
+
+       iocb->page_array = NULL;
        iocb->addr_array = NULL;
-err_addr:
+       iocb->mm = NULL;
+       iocb->tsk = NULL;
 
-       return result;
+       iocb->flags &= ~SDP_IOCB_F_LOCKED;
+
+       return 0;
 }
 
 /*
  * sdp_iocb_lock - lock the memory for an IOCB
+ * We do not take a reference on the mm; AIO handles this for us.
  */
 int sdp_iocb_lock(struct sdpc_iocb *iocb)
 {
-       struct vm_area_struct *vma;
-       kernel_cap_t real_cap;
-       unsigned long limit;
        int result = -ENOMEM;
        unsigned long addr;
-       size_t        size;
-       
-       /*
-        * mark IOCB as locked. We do not take a reference on the mm, AIO
-        * handles this for us.
-        */
-       iocb->flags |= SDP_IOCB_F_LOCKED;
-       iocb->mm     = current->mm;
+       size_t size;
+       int i;
        /*
-        * save and raise capabilities
+        * iocb->addr - buffer start address
+        * iocb->size - buffer length
+        * addr       - page aligned
+        * size       - page multiple
         */
-       real_cap = cap_t(current->cap_effective);
-       cap_raise(current->cap_effective, CAP_IPC_LOCK);
-  
-       size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
        addr = iocb->addr & PAGE_MASK;
+       size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
 
+       iocb->page_offset = iocb->addr - addr;
+
        iocb->page_count = size >> PAGE_SHIFT;
-
-       limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-       limit >>= PAGE_SHIFT;
-       /*
-        * lock the mm, if within the limit lock the address range.
-        */
-       down_write(&iocb->mm->mmap_sem);
-
-       if (!((iocb->page_count + current->mm->locked_vm) > limit))
-               result = (*mlock_ptr)(addr, size, 1);
        /*
-        * process result
-        */
-       if (result) {
-               sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
-                           iocb->addr, iocb->size, result, 
-                           iocb->page_count, iocb->mm->locked_vm, limit);
-               goto err_lock;
-       }
-       /*
-        * look up the head of the vma queue, loop through the vmas, marking
-        * them do not copy, reference counting, and saving them. 
+        * create array to hold the page values which are later needed
+        * to register the buffer with the HCA
         */
-       vma = find_vma(iocb->mm, addr);
-       if (!vma)
-               /*
-                * sanity check.
-                */
-               sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
-                        iocb->addr, iocb->size);
 
-       while (vma) {
-               spin_lock(&iocb->mm->page_table_lock);
+       /* TODO: use cache for allocations? Allocate by chunks? */
+       iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
+                                  GFP_KERNEL);
+       if (!iocb->addr_array)
+               goto err_addr;
 
-               if (!(VM_LOCKED & vma->vm_flags))
-                       sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
+       iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count),
+                                  GFP_KERNEL);
+       if (!iocb->page_array)
+               goto err_page;
 
-               if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
-                       sdp_dbg_err("VMA: private daya in use! <%08lx>",
-                                   (unsigned long)vma->vm_private_data);
-               
-               vma->vm_flags |= VM_DONTCOPY;
-               vma->vm_private_data++;
+       down_read(&current->mm->mmap_sem);
 
-               spin_unlock(&iocb->mm->page_table_lock);
+       result = get_user_pages(current, current->mm,
+                               iocb->addr, iocb->page_count,
+                               !!(iocb->flags & SDP_IOCB_F_RECV), 0,
+                               iocb->page_array, NULL);
+
+       up_read(&current->mm->mmap_sem);
+
+       if (result != iocb->page_count) {
+               sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
+                           iocb->addr, iocb->size, result, iocb->page_count);
+               /* drop any pages that were pinned before the failure */
+               for (i = 0; i < result; ++i)
+                       put_page(iocb->page_array[i]);
+               if (result >= 0)
+                       result = -EFAULT;
+               goto err_get;
+       }
 
-               sdp_dbg_data(NULL, 
-                            "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
-                            iocb->addr, vma, vma->vm_start, vma->vm_end,
-                            vma->vm_flags, (long)vma->vm_private_data);
+       iocb->flags |= SDP_IOCB_F_LOCKED;
+       iocb->mm     = current->mm;
+       iocb->tsk    = current;
 
-               if ((addr + size) > vma->vm_end)
-                       vma = vma->vm_next;
-               else
-                       vma = NULL;
-       }
 
-       result = sdp_iocb_page_save(iocb);
-       if (result) {
-               sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>", 
-                           result, iocb->addr, iocb->size);
-               goto err_save;
+       for (i = 0; i < iocb->page_count; ++i)
+               iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
 
-       up_write(&iocb->mm->mmap_sem);
-       cap_t(current->cap_effective) = real_cap;
-
        return 0;
-err_save:
-
-       (void)do_iocb_unlock(iocb);
-err_lock:
-       /*
-        * unlock the mm and restore capabilities.
-        */
-       up_write(&iocb->mm->mmap_sem);
-       cap_t(current->cap_effective) = real_cap;
-
-       iocb->flags &= ~SDP_IOCB_F_LOCKED;
-       iocb->mm = NULL;
 
+err_get:
+       kfree(iocb->page_array);
+err_page:
+       kfree(iocb->addr_array);
+err_addr:
        return result;
 }
 
 /*
- * IOCB memory locking init functions
- */
-struct kallsym_iter {
-       loff_t pos;
-       struct module *owner;
-       unsigned long value;
-       unsigned int nameoff; /* If iterating in core kernel symbols */
-       char type;
-       char name[128];
-};
-
-/*
- * sdp_mem_lock_init - initialize the userspace memory locking
- */
-static int sdp_mem_lock_init(void)
-{
-       struct file *kallsyms;
-       struct seq_file *seq;
-       struct kallsym_iter *iter;
-       loff_t pos = 0;
-       int ret = -EINVAL;
-       
-       sdp_dbg_init("Memory Locking initialization.");
-       
-       kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
-       if (!kallsyms) {
-               sdp_warn("Failed to open /proc/kallsyms");
-               goto done;
-       }
-
-       seq = (struct seq_file *)kallsyms->private_data;
-       if (!seq) {
-               sdp_warn("Failed to fetch sequential file.");
-               goto err_close;
-       }
-
-       for (iter = seq->op->start(seq, &pos);
-            iter != NULL;
-            iter = seq->op->next(seq, iter, &pos))
-               if (!strcmp(iter->name, "do_mlock"))
-                       mlock_ptr = (do_mlock_ptr_t)iter->value;
-
-       if (!mlock_ptr)
-               sdp_warn("Failed to find lock pointer.");
-       else
-               ret = 0;
-
-err_close:
-       filp_close(kallsyms, NULL);
-done:
-       return ret;
-}
-
-/*
- * sdp_mem_lock_cleanup - cleanup the memory locking tables
- */
-static int sdp_mem_lock_cleanup(void)
-{
-       sdp_dbg_init("Memory Locking cleanup.");
-       /*
-        * null out entries.
-        */
-       mlock_ptr = NULL;
-       
-       return 0;
-}
-
-/*
  * IOCB memory registration functions
  */
 
@@ -831,28 +639,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
 }
 
 /*
- * primary initialization/cleanup functions
- */
-
-/*
  * sdp_main_iocb_init - initialize the advertisment caches
  */
 int sdp_main_iocb_init(void)
 {
-       int result;
-
        sdp_dbg_init("IOCB cache initialization.");
-       /*
-        * initialize locking code.
-        */
-       result =  sdp_mem_lock_init();
-       if (result < 0) {
-               sdp_warn("Error <%d> initializing memory locking.", result);
-               return result;
-       }
-       /*
-        * initialize the caches only once.
-        */
+
        if (sdp_iocb_cache) {
                sdp_warn("IOCB caches already initialized.");
                return -EINVAL;
@@ -862,15 +654,10 @@ int sdp_main_iocb_init(void)
                                             sizeof(struct sdpc_iocb),
                                             0, SLAB_HWCACHE_ALIGN, NULL,
                                             NULL);
-       if (!sdp_iocb_cache) {
-               result = -ENOMEM;
-               goto error_iocb_c;
-       }
+       if (!sdp_iocb_cache)
+               return -ENOMEM;
 
        return 0;
-error_iocb_c:
-       (void)sdp_mem_lock_cleanup();
-       return result;
 }
 
 /*
@@ -879,16 +666,6 @@ error_iocb_c:
 void sdp_main_iocb_cleanup(void)
 {
        sdp_dbg_init("IOCB cache cleanup.");
-       /*
-        * cleanup the caches
-        */
        kmem_cache_destroy(sdp_iocb_cache);
-       /*
-        * null out entries.
-        */
        sdp_iocb_cache = NULL;
-       /*
-        * cleanup memory locking
-        */
-       (void)sdp_mem_lock_cleanup();
 }
Index: ulp/sdp/sdp_iocb.h
===================================================================
--- ulp/sdp/sdp_iocb.h  (revision 2292)
+++ ulp/sdp/sdp_iocb.h  (working copy)
@@ -52,7 +52,8 @@
 #define SDP_IOCB_F_RDMA_R 0x00000010 /* IOCB is in RDMA read processing */
 #define SDP_IOCB_F_RDMA_W 0x00000020 /* IOCB is in RDMA write processing */
 #define SDP_IOCB_F_LOCKED 0x00000040 /* IOCB is locked in memory */
-#define SDP_IOCB_F_REG    0x00000080 /* IOCB is memory is registered */
+#define SDP_IOCB_F_REG    0x00000080 /* IOCB memory is registered */
+#define SDP_IOCB_F_RECV   0x00000100 /* IOCB is for a receive request */
 #define SDP_IOCB_F_ALL    0xFFFFFFFF /* IOCB all mask */
 /*
  * zcopy constants.
@@ -99,9 +100,10 @@ struct sdpc_iocb {
        /*
         * page list. data for locking/registering userspace
         */
-       struct mm_struct *mm;      /* user mm struct */
-       unsigned long     addr;    /* user space address */
-       size_t            size;    /* total size of the user buffer */
+       struct mm_struct   *mm;      /* user mm struct */
+       struct task_struct *tsk;
+       unsigned long       addr;    /* user space address */
+       size_t              size;    /* total size of the user buffer */
 
        struct page **page_array;  /* list of page structure pointers. */
        u64          *addr_array;  /* list of physical page addresses. */
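
For reference, this is how I expect callers to use the new bit around
the lock/unlock pair (illustrative snippet only, not part of the patch;
iocb, msg, copied and result are from the caller's context):

        iocb->addr   = (unsigned long)msg->msg_iov->iov_base - copied;
        iocb->flags |= SDP_IOCB_F_RECV; /* receive: pages will be written */

        result = sdp_iocb_lock(iocb);   /* pins pages via get_user_pages */
        if (!result) {
                /* ... post the RDMA, wait for completion ... */
                sdp_iocb_unlock(iocb);  /* re-checks the mappings, copies
                                         * if COW moved the pages, unpins */
        }
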
-- 
MST - Michael S. Tsirkin