On 2018/12/19 5:03 AM, Thomas Schwinge wrote:
Hi Chung-Lin!

On Tue, 18 Dec 2018 23:06:38 +0800, Chung-Lin Tang <chunglin_t...@mentor.com> 
wrote:
this part includes some of the lookup_goacc_asyncqueue fixes we talked about.
I am still thinking about how the queue lock problem should really be solved, 
so regard
this patch as just fixing some of the problems.

Hi Thomas,
This is my solution to the queue lock stuff we talked about. I've only attached 
a diff to
be applied atop of the existing changes, though that may actually be easier to 
review.

Note that this is still in testing, which means it hasn't been tested :P, but 
I'm
posting to see if you have time to give it a look before the holidays.

Having the need for a lock on the async queues is quite irritating, especially
when the structure needed for managing them is quite simple.

Therefore, lets do away the need for locks entirely.

This patch makes the asyncqueue part of the device->openacc.async managed by 
lock-free
atomic operations; almost all of the complexity is contained in 
lookup_goacc_asyncqueue(),
so it should be not too complex. A descriptor and the queue array is 
allocated/exchanged
atomically when more storage is required, while in the common case a simple 
lookup is enough.
The fact that we manage asyncqueues by only expanding and never destroying 
asyncqueues
during the device lifetime also simplifies many things.

The current implementation may be not that optimized and clunky in some cases, 
but I think
this should be the way to implement what should be a fairly simple asyncqueue 
array and active
list. I'll update more on the status as testing proceeds.

(and about the other corners you noticed in the last mail, I'll get to that 
later...)

Thanks,
Chung-Lin







Sure, thanks.

Two comments, though:

--- libgomp/oacc-async.c        (revision 267226)
+++ libgomp/oacc-async.c        (working copy)

+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+  /* The special value acc_async_noval (-1) maps to the thread-specific
+     default async stream.  */
+  if (async == acc_async_noval)
+    async = thr->default_async;
+
+  if (async == acc_async_sync)
+    return NULL;
+
+  if (async < 0)
+    gomp_fatal ("bad async %d", async);
+
+  struct gomp_device_descr *dev = thr->dev;
+
+  gomp_mutex_lock (&dev->openacc.async.lock);
+
+  if (!create
+      && (async >= dev->openacc.async.nasyncqueue
+         || !dev->openacc.async.asyncqueue[async]))
+    {
+      gomp_mutex_unlock (&dev->openacc.async.lock);
+      return NULL;
+    }
+
+  if (async >= dev->openacc.async.nasyncqueue)
+    {
+      int diff = async + 1 - dev->openacc.async.nasyncqueue;
+      dev->openacc.async.asyncqueue
+       = gomp_realloc (dev->openacc.async.asyncqueue,
+                       sizeof (goacc_aq) * (async + 1));
+      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+             0, sizeof (goacc_aq) * diff);
+      dev->openacc.async.nasyncqueue = async + 1;
+    }
+
+  if (!dev->openacc.async.asyncqueue[async])
+    {
+      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func 
();
+
+      if (!dev->openacc.async.asyncqueue[async])
+       {
+         gomp_mutex_unlock (&dev->openacc.async.lock);
+         gomp_fatal ("async %d creation failed", async);
+       }

That will now always fail for host fallback, where
"host_openacc_async_construct" just always does "return NULL".

Actually, if the device doesn't support asyncqueues, this whole function
should turn into some kind of no-op, so that we don't again and again try
to create a new one for every call to "lookup_goacc_asyncqueue".

I'm attaching one possible solution.  I think it's fine to assume that
the majority of devices will support asyncqueues, and for those that
don't, this is just a one-time overhead per async-argument.  So, no
special handling required in "lookup_goacc_asyncqueue".

+      /* Link new async queue into active list.  */
+      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+      n->aq = dev->openacc.async.asyncqueue[async];
+      n->next = dev->openacc.async.active;
+      dev->openacc.async.active = n;
+    }
+  gomp_mutex_unlock (&dev->openacc.async.lock);

You still need to keep "async" locked during...

+  return dev->openacc.async.asyncqueue[async];

... this dereference.

+}


Oh, and:

--- libgomp/oacc-plugin.c       (revision 267226)
+++ libgomp/oacc-plugin.c       (working copy)
@@ -31,14 +31,10 @@
  #include "oacc-int.h"
void
-GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
+GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
+                             int async __attribute__((unused)))
  {
-  struct target_mem_desc *tgt = ptr;
-  struct gomp_device_descr *devicep = tgt->device_descr;
-
-  devicep->openacc.async_set_async_func (async);
-  gomp_unmap_vars (tgt, true);
-  devicep->openacc.async_set_async_func (acc_async_sync);
+  gomp_fatal ("invalid plugin function");
  }

Please add a comment here, something like: "Obsolete entry point, no
longer used."


Grüße
  Thomas


diff -ru trunk-orig/libgomp/libgomp.h trunk-work/libgomp/libgomp.h
--- trunk-orig/libgomp/libgomp.h        2018-12-20 23:24:20.040375724 +0800
+++ trunk-work/libgomp/libgomp.h        2018-12-21 23:32:47.938628954 +0800
@@ -937,6 +937,13 @@
 
 #include "splay-tree.h"
 
+struct goacc_asyncqueue_desc
+{
+  int nasyncqueue;
+  struct goacc_asyncqueue **asyncqueue;
+  struct goacc_asyncqueue_list *active;
+};
+
 typedef struct acc_dispatch_t
 {
   /* This is a linked list of data mapped using the
@@ -955,10 +962,7 @@
     *destroy_thread_data_func;
   
   struct {
-    gomp_mutex_t lock;
-    int nasyncqueue;
-    struct goacc_asyncqueue **asyncqueue;
-    struct goacc_asyncqueue_list *active;
+    struct goacc_asyncqueue_desc *aqdesc;
 
     __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
     __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
diff -ru trunk-orig/libgomp/oacc-async.c trunk-work/libgomp/oacc-async.c
--- trunk-orig/libgomp/oacc-async.c     2018-12-18 22:19:51.923102938 +0800
+++ trunk-work/libgomp/oacc-async.c     2018-12-21 23:40:36.088528890 +0800
@@ -70,45 +70,84 @@
 
   struct gomp_device_descr *dev = thr->dev;
 
-  gomp_mutex_lock (&dev->openacc.async.lock);
+  struct goacc_asyncqueue_desc *aqdesc = dev->openacc.async.aqdesc;
 
-  if (!create
-      && (async >= dev->openacc.async.nasyncqueue
-         || !dev->openacc.async.asyncqueue[async]))
-    {
-      gomp_mutex_unlock (&dev->openacc.async.lock);
-      return NULL;
-    }
+  if (!create)
+    return async < aqdesc->nasyncqueue ? aqdesc->asyncqueue[async] : NULL;
 
-  if (async >= dev->openacc.async.nasyncqueue)
-    {
-      int diff = async + 1 - dev->openacc.async.nasyncqueue;
-      dev->openacc.async.asyncqueue
-       = gomp_realloc (dev->openacc.async.asyncqueue,
-                       sizeof (goacc_aq) * (async + 1));
-      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
-             0, sizeof (goacc_aq) * diff);
-      dev->openacc.async.nasyncqueue = async + 1;
-    }
+  if (async < aqdesc->nasyncqueue && aqdesc->asyncqueue[async])
+    return aqdesc->asyncqueue[async];
 
-  if (!dev->openacc.async.asyncqueue[async])
-    {
-      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func 
();
+  struct goacc_asyncqueue *new_aq = dev->openacc.async.construct_func ();
+  const int inc_size = 8;
+  int new_sz = async + 1 + inc_size;
+  new_sz = (new_sz + inc_size - 1) & ~(inc_size - 1);
+
+  struct goacc_asyncqueue **new_aqvec =
+    gomp_malloc_cleared (sizeof (goacc_aq) * new_sz);
+  new_aqvec[async] = new_aq;
+
+  struct goacc_asyncqueue_desc *ndesc =
+    gomp_malloc (sizeof (struct goacc_asyncqueue_desc));
+  ndesc->asyncqueue = new_aqvec;
+  ndesc->nasyncqueue = new_sz;
+
+  goacc_aq_list nl = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+  nl->aq = new_aq;
+  ndesc->active = nl;
+
+  /* Loop to atomically create and add a new async queue, without using a lock,
+     though may possibly requiring creating and replacing with a new 
descriptor.
+     This may likely not be as optimized as it can be, although simplifies
+     other handling of async queues.  */
+  do {
+    int curr_num_aqs = aqdesc->nasyncqueue;
+    
+    if (async < curr_num_aqs)
+      {
+       /* Someone else expanded the asyncqeue array to enough size, free the
+          allocated resources.  */
+       free (ndesc->asyncqueue);
+       free (ndesc);
+
+       /* Try to set the new aq to the right place again.  */
+       struct goacc_asyncqueue_desc *nullq = NULL;
+       if (__atomic_compare_exchange (&aqdesc->asyncqueue[async],
+                                      &nullq, &new_aq, false,
+                                      __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
+         {
+           /* Push new asyncqueue to front of active list.  */
+           nl->next = aqdesc->active;
+           while (!__atomic_compare_exchange (&aqdesc->active, &nl->next, &nl,
+                                              true, __ATOMIC_ACQ_REL,
+                                              __ATOMIC_ACQUIRE))
+             ;
+         }
+       else
+         {
+           /* Okay, someone else already created 'async', free/destroy
+              everything allocated and return existing queue directly.  */
+           if (!dev->openacc.async.destruct_func (new_aq))
+             gomp_fatal ("error in async queue creation");
+           free (nl);
+         }
+       return aqdesc->asyncqueue[async];
+      }
 
-      if (!dev->openacc.async.asyncqueue[async])
-       {
-         gomp_mutex_unlock (&dev->openacc.async.lock);
-         gomp_fatal ("async %d creation failed", async);
-       }
-      
-      /* Link new async queue into active list.  */
-      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
-      n->aq = dev->openacc.async.asyncqueue[async];
-      n->next = dev->openacc.async.active;
-      dev->openacc.async.active = n;
-    }
-  gomp_mutex_unlock (&dev->openacc.async.lock);
-  return dev->openacc.async.asyncqueue[async];
+    /* Copy current asyncqueue contents*/
+    memcpy (ndesc->asyncqueue, aqdesc->asyncqueue,
+           sizeof (struct goacc_asyncqueue *) * curr_num_aqs);
+    ndesc->active->next = aqdesc->active;
+
+  } while (!__atomic_compare_exchange (&dev->openacc.async.aqdesc,
+                                      &aqdesc, &ndesc, false,
+                                      __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
+
+  /* Free the old stale descriptor and resources in aqdesc.  */
+  free (aqdesc->asyncqueue);
+  free (aqdesc);
+
+  return dev->openacc.async.aqdesc->asyncqueue[async];
 }
 
 attribute_hidden struct goacc_asyncqueue *
@@ -139,14 +178,12 @@
   struct goacc_thread *thr = get_goacc_thread ();
 
   int ret = 1;
-  gomp_mutex_lock (&thr->dev->openacc.async.lock);
-  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+  for (goacc_aq_list l = thr->dev->openacc.async.aqdesc->active; l; l = 
l->next)
     if (!thr->dev->openacc.async.test_func (l->aq))
       {
        ret = 0;
        break;
       }
-  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
   return ret;
 }
 
@@ -194,10 +231,8 @@
 {
   struct gomp_device_descr *dev = get_goacc_thread_device ();
 
-  gomp_mutex_lock (&dev->openacc.async.lock);
-  for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
+  for (goacc_aq_list l = dev->openacc.async.aqdesc->active; l; l = l->next)
     dev->openacc.async.synchronize_func (l->aq);
-  gomp_mutex_unlock (&dev->openacc.async.lock);
 }
 
 /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  
*/
@@ -221,14 +256,12 @@
 
   goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
 
-  gomp_mutex_lock (&thr->dev->openacc.async.lock);
-  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+  for (goacc_aq_list l = thr->dev->openacc.async.aqdesc->active; l; l = 
l->next)
     {
       thr->dev->openacc.async.synchronize_func (l->aq);
       if (waiting_queue)
        thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
     }
-  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
 }
 
 int
@@ -261,30 +294,27 @@
 attribute_hidden void
 goacc_init_asyncqueues (struct gomp_device_descr *devicep)
 {
-  gomp_mutex_init (&devicep->openacc.async.lock);
-  devicep->openacc.async.nasyncqueue = 0;
-  devicep->openacc.async.asyncqueue = NULL;
-  devicep->openacc.async.active = NULL;
+  devicep->openacc.async.aqdesc =
+    gomp_malloc_cleared (sizeof (struct goacc_asyncqueue_desc));
 }
 
 attribute_hidden bool
 goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
 {
+  struct goacc_asyncqueue_desc *aqdesc = devicep->openacc.async.aqdesc;
   bool ret = true;
-  if (devicep->openacc.async.nasyncqueue > 0)
+
+  if (aqdesc->nasyncqueue > 0)
     {
       goacc_aq_list next;
-      for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
+      for (goacc_aq_list l = aqdesc->active; l; l = next)
        {
          ret &= devicep->openacc.async.destruct_func (l->aq);
          next = l->next;
          free (l);
        }
-      free (devicep->openacc.async.asyncqueue);
-      devicep->openacc.async.nasyncqueue = 0;
-      devicep->openacc.async.asyncqueue = NULL;
-      devicep->openacc.async.active = NULL;
+      free (aqdesc->asyncqueue);
     }
-  gomp_mutex_destroy (&devicep->openacc.async.lock);
+  free (aqdesc);
   return ret;
 }
diff -ru trunk-orig/libgomp/oacc-cuda.c trunk-work/libgomp/oacc-cuda.c
--- trunk-orig/libgomp/oacc-cuda.c      2018-12-18 18:09:08.601913507 +0800
+++ trunk-work/libgomp/oacc-cuda.c      2018-12-21 23:34:10.024738834 +0800
@@ -87,9 +87,7 @@
   if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
     {
       goacc_aq aq = get_goacc_asyncqueue (async);
-      gomp_mutex_lock (&thr->dev->openacc.async.lock);
       ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
-      gomp_mutex_unlock (&thr->dev->openacc.async.lock);
     }
 
   return ret;

Reply via email to