[EMAIL PROTECTED] wrote on Wed, 18 Oct 2006 16:21 -0500:
> Rob pointed out that even with one buffer pointer passed into  
> PVFS_sys_io, if the request is non-contiguous, the offsets of the  
> request could just have been calculated from different buffer  
> pointers.  So we don't know how many separate buffers are being used  
> over the memory request, except that there's at most as many buffers  
> as contiguous regions in the request.  The PINT_process_request code  
> (potentially) breaks those buffers up even further though, based on  
> the distribution parameters (like strip size), before passing the  
> pointers to bmi.  It seems like we could do what you're suggesting,  
> but we would have to do it per each contiguous region of the  
> request.  Maybe that's not such a big deal?  Not sure...

Good point.  I bailed and didn't bother with the hint in the case
the memory request is non-contiguous.  It's only an optimization,
not a requirement, and for a highly fragmented memory request, this
may be more expensive than it's worth.

I also didn't bother with any explicit deregistration.  Since we
are just caching, not adding any user-visible memory management API,
we won't know if or when the user will use that buffer again.  So we
just leave it up to the registration cache manager to decide when to
toss it.  Which isn't actually implemented at the moment---it just
holds onto everything forever.

Here's a patch against CVS head.  Let me know if you see anything
funny.  (There's a couple other fixits that snuck in that don't
matter.)  Brett, feel free to try this out.  It should cut the number
of memory registrations from 900-odd down to 1-odd in your case.

                -- Pete

Index: src/client/sysint/sys-io.sm
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/client/sysint/sys-io.sm,v
retrieving revision 1.152
diff -u -p -r1.152 sys-io.sm
--- src/client/sysint/sys-io.sm 26 Sep 2006 03:44:16 -0000      1.152
+++ src/client/sysint/sys-io.sm 19 Oct 2006 17:19:39 -0000
@@ -1674,6 +1674,23 @@ static inline int io_post_flow(
     attr = &sm_p->getattr.attr;
     assert(attr);
     
+    /*
+     * Notify BMI about the memory buffer the user passed in.  For transports
+     * that need registration, this allows them to work with one large region
+     * rather than lots of small stripe-size regions.  But only bother if the
+     * request is contiguous; too complex and likely no faster in the highly
+     * fragmented case.
+     */
+    if (sm_p->u.io.mem_req->num_contig_chunks == 1)
+    {
+        struct bmi_optimistic_buffer_info binfo;
+
+        binfo.buffer = sm_p->u.io.buffer;
+        binfo.len = PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req);
+        binfo.rw = sm_p->u.io.io_type;
+        BMI_set_info(cur_ctx->msg.svr_addr, BMI_OPTIMISTIC_BUFFER_REG, &binfo);
+    }
+
     gossip_debug(GOSSIP_IO_DEBUG, "* mem req size is %lld, "
                  "file_req size is %lld (bytes)\n",
                  lld(PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req)),
Index: src/io/bmi/bmi-types.h
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/io/bmi/bmi-types.h,v
retrieving revision 1.28
diff -u -p -r1.28 bmi-types.h
--- src/io/bmi/bmi-types.h      11 Sep 2006 15:42:38 -0000      1.28
+++ src/io/bmi/bmi-types.h      19 Oct 2006 17:19:39 -0000
@@ -76,6 +76,15 @@ enum
     BMI_TCP_BUFFER_SEND_SIZE = 11,
     BMI_TCP_BUFFER_RECEIVE_SIZE = 12,
     BMI_TCP_CLOSE_SOCKET = 13,
+    BMI_OPTIMISTIC_BUFFER_REG = 14,
+};
+
+/** used to describe a memory region in passing down a registration
+ * hint from IO routines. */
+struct bmi_optimistic_buffer_info {
+    const void *buffer;
+    PVFS_size len;
+    enum PVFS_io_type rw;
 };
 
 /* mappings from PVFS errors to BMI errors */
Index: src/io/bmi/bmi_ib/ib.c
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/io/bmi/bmi_ib/ib.c,v
retrieving revision 1.46
diff -u -p -r1.46 ib.c
--- src/io/bmi/bmi_ib/ib.c      12 Oct 2006 20:37:28 -0000      1.46
+++ src/io/bmi/bmi_ib/ib.c      19 Oct 2006 17:19:39 -0000
@@ -231,7 +231,7 @@ static int ib_check_cq(void)
        } else if (wc.opcode == BMI_IB_OP_SEND) {
 
            /* periodic send queue flush, qp or qp_ack */
-           debug(2, "%s: send to %s completed locally", __func__,
+           debug(2, "%s: send (or ack) to %s completed locally", __func__,
              ((ib_connection_t *) ptr_from_int64(wc.id))->peername);
 
        } else {
@@ -1264,6 +1264,7 @@ BMI_ib_testcontext(int incount, bmi_op_i
        /* if time since last activity is > 10ms, block */
        if (now.tv_sec > 0 || now.tv_usec > 10000) {
            /* block */
+           debug(2, "%s: last activity too long ago, blocking", __func__);
            n = ib_block_for_activity(max_idle_time);
            if (n)
                gettimeofday(&last_action, 0);  /* had some action */
@@ -1835,6 +1836,12 @@ static int BMI_ib_set_info(int option, v
        ib_method_addr_t *ibmap = map->method_data;
        free(ibmap->hostname);
        free(map);
+       break;
+    }
+    case BMI_OPTIMISTIC_BUFFER_REG: {
+       const struct bmi_optimistic_buffer_info *binfo = param;
+       memcache_preregister(ib_device->memcache, binfo->buffer,
+                            binfo->len, binfo->rw);
        break;
     }
     default:
Index: src/io/bmi/bmi_ib/ib.h
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/io/bmi/bmi_ib/ib.h,v
retrieving revision 1.24
diff -u -p -r1.24 ib.h
--- src/io/bmi/bmi_ib/ib.h      12 Oct 2006 20:37:27 -0000      1.24
+++ src/io/bmi/bmi_ib/ib.h      19 Oct 2006 17:19:39 -0000
@@ -397,6 +397,8 @@ int write_full(int fd, const void *buf, 
 void *memcache_memalloc(void *md, bmi_size_t len, int eager_limit);
 int memcache_memfree(void *md, void *buf, bmi_size_t len);
 void memcache_register(void *md, ib_buflist_t *buflist);
+void memcache_preregister(void *md, const void *buf, bmi_size_t len,
+                          enum PVFS_io_type rw);
 void memcache_deregister(void *md, ib_buflist_t *buflist);
 void *memcache_init(void (*mem_register)(memcache_entry_t *),
                     void (*mem_deregister)(memcache_entry_t *));
Index: src/io/bmi/bmi_ib/mem.c
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/io/bmi/bmi_ib/mem.c,v
retrieving revision 1.9
diff -u -p -r1.9 mem.c
--- src/io/bmi/bmi_ib/mem.c     13 Sep 2006 23:11:21 -0000      1.9
+++ src/io/bmi/bmi_ib/mem.c     19 Oct 2006 17:19:39 -0000
@@ -21,6 +21,10 @@
  * This internal state structure is allocated when the init function
  * is called.  The device hangs onto it and gives it back to us as
  * needed.
+ *
+ * TODO: Use an rbtree here instead.  Also deregister refcnt==0 regions
+ * when new ones come along that overlap, much like dreg, as an indication
+ * that application buffers have changed.
  */
 typedef struct {
     list_t list;
@@ -208,6 +212,36 @@ memcache_register(void *md, ib_buflist_t
     gen_mutex_unlock(&memcache_device->mutex);
 }
 
+/*
+ * Similar to the normal register call, but does not use a buflist,
+ * just adds an entry to the cache for use by later registrations.
+ * Also does not add a refcnt on any entry.
+ */
+void memcache_preregister(void *md, const void *buf, bmi_size_t len,
+                          enum PVFS_io_type rw __unused)
+{
+#if ENABLE_MEMCACHE
+    memcache_device_t *memcache_device = md;
+    memcache_entry_t *c;
+
+    gen_mutex_lock(&memcache_device->mutex);
+    c = memcache_lookup_cover(memcache_device, buf, len);
+    if (c) {
+       debug(2, "%s: hit %p len %lld (via %p len %lld) refcnt now %d",
+             __func__, buf, lld(len), c->buf, lld(c->len), c->count);
+    } else {
+       debug(2, "%s: miss %p len %lld", __func__, buf, lld(len));
+       c = memcache_add(memcache_device, (void *)(uintptr_t) buf, len);
+       if (!c)
+           error("%s: no memory for cache entry", __func__);
+       (memcache_device->mem_register)(c);
+    }
+    gen_mutex_unlock(&memcache_device->mutex);
+#else
+    md; buf; len;
+#endif
+}
+
 void
 memcache_deregister(void *md, ib_buflist_t *buflist)
 {
@@ -219,8 +253,10 @@ memcache_deregister(void *md, ib_buflist
 #if ENABLE_MEMCACHE
        memcache_entry_t *c = buflist->memcache[i];
        --c->count;
-       debug(2, "%s: dec refcount [%d] %p len %lld count now %d", __func__, i,
-         buflist->buf.send[i], lld(buflist->len[i]), c->count);
+       debug(2,
+          "%s: dec refcount [%d] %p len %lld (via %p len %lld) refcnt now %d",
+          __func__, i, buflist->buf.send[i], lld(buflist->len[i]),
+          c->buf, lld(c->len), c->count);
        /* let garbage collection do ib_mem_deregister(c) for refcnt==0 */
 #else
        (memcache_device->mem_deregister)(buflist->memcache[i]);
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to