This patch adds: // New functions to set/get the current default async queue void acc_set_default_async (int); int acc_get_default_async (void);
and _async versions of a few existing API functions: void acc_copyin_async (void *, size_t, int); void acc_create_async (void *, size_t, int); void acc_copyout_async (void *, size_t, int); void acc_delete_async (void *, size_t, int); void acc_update_device_async (void *, size_t, int); void acc_update_self_async (void *, size_t, int); void acc_memcpy_to_device_async (void *, void *, size_t, int); void acc_memcpy_from_device_async (void *, void *, size_t, int); These implement part of the additional requirements for OpenACC 2.5 Tested and committed to gomp-4_0-branch. Chung-Lin 2017-02-13 Chung-Lin Tang <clt...@codesourcery.com> libgomp/ * oacc-async.c (acc_get_default_async): New API function. (acc_set_default_async): Likewise. * oacc-init.c (): * oacc-int.h (struct goacc_thread): Add default_async field. * oacc-mem.c (memcpy_tofrom_device): New function, combined from acc_memcpy_to/from_device functions, now with async parameter. (acc_memcpy_to_device): Modify to use memcpy_tofrom_device. (acc_memcpy_from_device): Likewise. (acc_memcpy_to_device_async): New API function. (acc_memcpy_from_device_async): Likewise. (present_create_copy): Add async parameter. (acc_create): Adjust present_create_copy call. (acc_copyin): Likewise. (acc_present_or_create): Likewise. (acc_present_or_copyin): Likewise. (acc_create_async): New API function. (acc_copyin_async): New API function. (delete_copyout): Add async parameter. (acc_delete): Adjust delete_copyout call. (acc_copyout): Likewise. (acc_delete_async): New API function. (acc_copyout_async): Likewise. (update_dev_host): Add async parameter. (acc_update_device): Adjust update_dev_host call. (acc_update_self): Likewise. (acc_update_device_async): New API function. (acc_update_self_async): Likewise. * oacc-plugin.c (GOMP_PLUGIN_acc_thread_default_async): New function. * oacc-plugin.h (GOMP_PLUGIN_acc_thread_default_async): Declare. * openacc.f90 (acc_async_default): Declare. (acc_set_default_async): Likewise. (acc_get_default_async): Likewise. * openacc_lib.h (acc_async_default): Declare. (acc_set_default_async): Likewise. (acc_get_default_async): Likewise. * testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c: New test. * testsuite/libgomp.oacc-c-c++-common/lib-94.c: New test. * testsuite/libgomp.oacc-c-c++-common/lib-95.c: New test. * testsuite/libgomp.oacc-fortran/lib-16.f90: New test. include/ * gomp-constants.h (GOMP_ASYNC_DEFAULT): Define.
Index: libgomp/oacc-async.c =================================================================== --- libgomp/oacc-async.c (revision 245382) +++ libgomp/oacc-async.c (working copy) @@ -105,3 +105,28 @@ acc_wait_all_async (int async) thr->dev->openacc.async_wait_all_async_func (async); } + +int +acc_get_default_async (void) +{ + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + return thr->default_async; +} + +void +acc_set_default_async (int async) +{ + if (async < acc_async_sync) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->default_async = async; +} Index: libgomp/oacc-init.c =================================================================== --- libgomp/oacc-init.c (revision 245382) +++ libgomp/oacc-init.c (working copy) @@ -437,6 +437,8 @@ goacc_attach_host_thread_to_device (int ord) thr->target_tls = acc_dev->openacc.create_thread_data_func (ord); + + thr->default_async = acc_async_default; acc_dev->openacc.async_set_async_func (acc_async_sync); } Index: libgomp/oacc-int.h =================================================================== --- libgomp/oacc-int.h (revision 245382) +++ libgomp/oacc-int.h (working copy) @@ -73,6 +73,9 @@ struct goacc_thread /* Target-specific data (used by plugin). */ void *target_tls; + + /* Default OpenACC async queue for current thread, exported to plugin. */ + int default_async; }; #if defined HAVE_TLS || defined USE_EMUTLS Index: libgomp/oacc-mem.c =================================================================== --- libgomp/oacc-mem.c (revision 245382) +++ libgomp/oacc-mem.c (working copy) @@ -153,8 +153,9 @@ acc_free (void *d) gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); } -void -acc_memcpy_to_device (void *d, void *h, size_t s) +static void +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, + const char *libfnname) { /* No need to call lazy open here, as the device pointer must have been obtained from a routine that did that. */ @@ -164,31 +165,49 @@ acc_free (void *d) if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) { - memmove (d, h, s); + if (from) + memmove (h, d, s); + else + memmove (d, h, s); return; } - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) - gomp_fatal ("error in %s", __FUNCTION__); + if (async > acc_async_sync) + thr->dev->openacc.async_set_async_func (async); + + bool ret = (from + ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) + : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); + + if (async > acc_async_sync) + thr->dev->openacc.async_set_async_func (acc_async_sync); + + if (!ret) + gomp_fatal ("error in %s", libfnname); } void -acc_memcpy_from_device (void *h, void *d, size_t s) +acc_memcpy_to_device (void *d, void *h, size_t s) { - /* No need to call lazy open here, as the device pointer must have - been obtained from a routine that did that. */ - struct goacc_thread *thr = goacc_thread (); + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); +} - assert (thr && thr->dev); +void +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) +{ + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); +} - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) - { - memmove (h, d, s); - return; - } +void +acc_memcpy_from_device (void *h, void *d, size_t s) +{ + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); +} - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) - gomp_fatal ("error in %s", __FUNCTION__); +void +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) +{ + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); } /* Return the device pointer that corresponds to host data H. Or NULL @@ -424,7 +443,7 @@ acc_unmap_data (void *h) #define FLAG_COPY (1 << 2) static void * -present_create_copy (unsigned f, void *h, size_t s) +present_create_copy (unsigned f, void *h, size_t s, int async) { void *d; splay_tree_key n; @@ -481,9 +500,15 @@ static void * gomp_mutex_unlock (&acc_dev->lock); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (async); + tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, GOMP_MAP_VARS_OPENACC); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (acc_async_sync); + gomp_mutex_lock (&acc_dev->lock); d = tgt->to_free; @@ -499,31 +524,44 @@ static void * void * acc_create (void *h, size_t s) { - return present_create_copy (FLAG_CREATE, h, s); + return present_create_copy (FLAG_CREATE, h, s, acc_async_sync); } +void +acc_create_async (void *h, size_t s, int async) +{ + present_create_copy (FLAG_CREATE, h, s, async); +} + void * acc_copyin (void *h, size_t s) { - return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s); + return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s, acc_async_sync); } +void +acc_copyin_async (void *h, size_t s, int async) +{ + present_create_copy (FLAG_CREATE | FLAG_COPY, h, s, async); +} + void * acc_present_or_create (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); } void * acc_present_or_copyin (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, + acc_async_sync); } #define FLAG_COPYOUT (1 << 0) static void -delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) +delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) { size_t host_size; splay_tree_key n; @@ -561,11 +599,17 @@ static void gomp_mutex_unlock (&acc_dev->lock); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (async); + if (f & FLAG_COPYOUT) acc_dev->dev2host_func (acc_dev->target_id, h, d, s); acc_unmap_data (h); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (acc_async_sync); + if (!acc_dev->free_func (acc_dev->target_id, d)) gomp_fatal ("error in freeing device memory in %s", libfnname); } @@ -573,17 +617,29 @@ static void void acc_delete (void *h , size_t s) { - delete_copyout (0, h, s, __FUNCTION__); + delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); } void +acc_delete_async (void *h , size_t s, int async) +{ + delete_copyout (0, h, s, async, __FUNCTION__); +} + +void acc_copyout (void *h, size_t s) { - delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); + delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); } +void +acc_copyout_async (void *h, size_t s, int async) +{ + delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); +} + static void -update_dev_host (int is_dev, void *h, size_t s) +update_dev_host (int is_dev, void *h, size_t s, int async) { splay_tree_key n; void *d; @@ -609,27 +665,45 @@ static void d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h - n->host_start); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (async); + if (is_dev) acc_dev->host2dev_func (acc_dev->target_id, d, h, s); else acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (acc_async_sync); + gomp_mutex_unlock (&acc_dev->lock); } void acc_update_device (void *h, size_t s) { - update_dev_host (1, h, s); + update_dev_host (1, h, s, acc_async_sync); } void +acc_update_device_async (void *h, size_t s, int async) +{ + update_dev_host (1, h, s, async); +} + +void acc_update_self (void *h, size_t s) { - update_dev_host (0, h, s); + update_dev_host (0, h, s, acc_async_sync); } void +acc_update_self_async (void *h, size_t s, int async) +{ + update_dev_host (0, h, s, async); +} + +void gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, void *kinds) { Index: libgomp/oacc-plugin.c =================================================================== --- libgomp/oacc-plugin.c (revision 245382) +++ libgomp/oacc-plugin.c (working copy) @@ -49,3 +49,12 @@ GOMP_PLUGIN_acc_thread (void) struct goacc_thread *thr = goacc_thread (); return thr ? thr->target_tls : NULL; } + +/* Return the default async number from the TLS data for the current thread. */ + +int +GOMP_PLUGIN_acc_thread_default_async (void) +{ + struct goacc_thread *thr = goacc_thread (); + return thr ? thr->default_async : acc_async_default; +} Index: libgomp/oacc-plugin.h =================================================================== --- libgomp/oacc-plugin.h (revision 245382) +++ libgomp/oacc-plugin.h (working copy) @@ -29,5 +29,6 @@ extern void GOMP_PLUGIN_async_unmap_vars (void *, int); extern void *GOMP_PLUGIN_acc_thread (void); +extern int GOMP_PLUGIN_acc_thread_default_async (void); #endif Index: libgomp/openacc.f90 =================================================================== --- libgomp/openacc.f90 (revision 245382) +++ libgomp/openacc.f90 (working copy) @@ -51,9 +51,10 @@ module openacc_kinds integer, parameter :: acc_handle_kind = int32 - public :: acc_async_noval, acc_async_sync + public :: acc_async_default, acc_async_noval, acc_async_sync ! Keep in sync with include/gomp-constants.h. + integer (acc_handle_kind), parameter :: acc_async_default = 0 integer (acc_handle_kind), parameter :: acc_async_noval = -1 integer (acc_handle_kind), parameter :: acc_async_sync = -2 @@ -92,6 +93,16 @@ module openacc_internal integer (acc_device_kind) d end function + subroutine acc_set_default_async_h (a) + import + integer a + end subroutine + + function acc_get_default_async_h () + import + integer acc_get_default_async_h + end function + function acc_async_test_h (a) logical acc_async_test_h integer a @@ -296,6 +307,150 @@ module openacc_internal logical acc_is_present_array_h type (*), dimension (..), contiguous :: a end function + + subroutine acc_copyin_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine end interface interface @@ -458,6 +613,60 @@ module openacc_internal type (*), dimension (*) :: a integer (c_size_t), value :: len end function + + subroutine acc_copyin_async_l (a, len, async) & + bind (C, name = "acc_copyin_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_create_async_l (a, len, async) & + bind (C, name = "acc_create_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_copyout_async_l (a, len, async) & + bind (C, name = "acc_copyout_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_delete_async_l (a, len, async) & + bind (C, name = "acc_delete_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_update_device_async_l (a, len, async) & + bind (C, name = "acc_update_device_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_update_self_async_l (a, len, async) & + bind (C, name = "acc_update_self_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine end interface end module @@ -470,11 +679,14 @@ module openacc public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type public :: acc_set_device_num, acc_get_device_num, acc_async_test + public :: acc_set_default_async, acc_get_default_async public :: acc_async_test_all, acc_wait, acc_wait_async, acc_wait_all public :: acc_wait_all_async, acc_init, acc_shutdown, acc_on_device public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete public :: acc_update_device, acc_update_self, acc_is_present + public :: acc_copyin_async, acc_create_async, acc_copyout_async + public :: acc_delete_async, acc_update_device_async, acc_update_self_async integer, parameter :: openacc_version = 201306 @@ -498,6 +710,14 @@ module openacc procedure :: acc_get_device_num_h end interface + interface acc_set_default_async + procedure :: acc_set_default_async_h + end interface + + interface acc_get_default_async + procedure :: acc_get_default_async_h + end interface + interface acc_async_test procedure :: acc_async_test_h end interface @@ -618,6 +838,42 @@ module openacc ! acc_memcpy_to_device: Only available in C/C++ ! acc_memcpy_from_device: Only available in C/C++ + interface acc_copyin_async + procedure :: acc_copyin_async_32_h + procedure :: acc_copyin_async_64_h + procedure :: acc_copyin_async_array_h + end interface + + interface acc_create_async + procedure :: acc_create_async_32_h + procedure :: acc_create_async_64_h + procedure :: acc_create_async_array_h + end interface + + interface acc_copyout_async + procedure :: acc_copyout_async_32_h + procedure :: acc_copyout_async_64_h + procedure :: acc_copyout_async_array_h + end interface + + interface acc_delete_async + procedure :: acc_delete_async_32_h + procedure :: acc_delete_async_64_h + procedure :: acc_delete_async_array_h + end interface + + interface acc_update_device_async + procedure :: acc_update_device_async_32_h + procedure :: acc_update_device_async_64_h + procedure :: acc_update_device_async_array_h + end interface + + interface acc_update_self_async + procedure :: acc_update_self_async_32_h + procedure :: acc_update_self_async_64_h + procedure :: acc_update_self_async_array_h + end interface + end module function acc_get_num_devices_h (d) @@ -954,3 +1210,189 @@ function acc_is_present_array_h (a) type (*), dimension (..), contiguous :: a acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1 end function + +subroutine acc_copyin_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_copyin_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyin_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_copyin_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyin_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_copyin_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_create_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_create_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_create_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_create_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_create_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_create_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_create_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyout_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_copyout_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyout_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_copyout_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyout_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_copyout_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_delete_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_delete_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_delete_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_delete_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_delete_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_delete_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_device_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_update_device_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_device_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_update_device_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_device_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_update_device_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_self_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_update_self_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_self_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_update_self_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_self_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_update_self_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine Index: libgomp/openacc.h =================================================================== --- libgomp/openacc.h (revision 245382) +++ libgomp/openacc.h (working copy) @@ -63,6 +63,7 @@ typedef enum acc_device_t { typedef enum acc_async_t { /* Keep in sync with include/gomp-constants.h. */ + acc_async_default = 0, acc_async_noval = -1, acc_async_sync = -2 } acc_async_t; @@ -72,6 +73,8 @@ void acc_set_device_type (acc_device_t) __GOACC_NO acc_device_t acc_get_device_type (void) __GOACC_NOTHROW; void acc_set_device_num (int, acc_device_t) __GOACC_NOTHROW; int acc_get_device_num (acc_device_t) __GOACC_NOTHROW; +void acc_set_default_async (int) __GOACC_NOTHROW; +int acc_get_default_async (void) __GOACC_NOTHROW; int acc_async_test (int) __GOACC_NOTHROW; int acc_async_test_all (void) __GOACC_NOTHROW; void acc_wait (int) __GOACC_NOTHROW; @@ -105,6 +108,16 @@ int acc_is_present (void *, size_t) __GOACC_NOTHRO void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW; void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW; +/* Async functions, specified in OpenACC 2.5. */ +void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_create_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_delete_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; +void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; + /* Old names. OpenACC does not specify whether these can or must not be macros, inlines or aliases for the new names. */ #define acc_pcreate acc_present_or_create Index: libgomp/openacc_lib.h =================================================================== --- libgomp/openacc_lib.h (revision 245382) +++ libgomp/openacc_lib.h (working copy) @@ -46,6 +46,7 @@ integer, parameter :: acc_handle_kind = 4 ! Keep in sync with include/gomp-constants.h. + integer (acc_handle_kind), parameter :: acc_async_default = 0 integer (acc_handle_kind), parameter :: acc_async_noval = -1 integer (acc_handle_kind), parameter :: acc_async_sync = -2 @@ -89,6 +90,18 @@ end function end interface + interface acc_set_default_async + subroutine acc_set_default_async_h (a) + integer a + end subroutine + end interface + + interface acc_get_default_async + function acc_get_default_async_h () + integer acc_get_default_async_h + end function + end interface + interface acc_async_test function acc_async_test_h (a) logical acc_async_test_h @@ -380,3 +393,159 @@ ! acc_memcpy_to_device: Only available in C/C++ ! acc_memcpy_from_device: Only available in C/C++ + + interface acc_copyin_async + subroutine acc_copyin_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_create_async + subroutine acc_create_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_copyout_async + subroutine acc_copyout_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_delete_async + subroutine acc_delete_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_update_device_async + subroutine acc_update_device_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_update_self_async + subroutine acc_update_self_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface Index: libgomp/plugin/plugin-nvptx.c =================================================================== --- libgomp/plugin/plugin-nvptx.c (revision 245382) +++ libgomp/plugin/plugin-nvptx.c (working copy) @@ -414,13 +414,10 @@ select_stream_for_async (int async, pthread_t thre struct ptx_stream *stream = NULL; int orig_async = async; - /* The special value acc_async_noval (-1) maps (for now) to an - implicitly-created stream, which is then handled the same as any other - numbered async stream. Other options are available, e.g. using the null - stream for anonymous async operations, or choosing an idle stream from an - active set. But, stick with this for now. */ - if (async > acc_async_sync) - async++; + /* The special value acc_async_noval (-1) maps to the thread-specific + default async stream. */ + if (async == acc_async_noval) + async = GOMP_PLUGIN_acc_thread_default_async (); if (create) pthread_mutex_lock (&ptx_dev->stream_lock); Index: libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 =================================================================== --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (revision 0) +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (revision 0) @@ -0,0 +1,57 @@ +! { dg-do run } +! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } + +program main + use openacc + implicit none + + integer, parameter :: N = 256 + integer, allocatable :: h(:) + integer :: i + integer :: async = 5 + + allocate (h(N)) + + do i = 1, N + h(i) = i + end do + + call acc_copyin (h) + + do i = 1, N + h(i) = i + i + end do + + call acc_update_device_async (h, sizeof (h), async) + + if (acc_is_present (h) .neqv. .TRUE.) call abort + + h(:) = 0 + + call acc_copyout_async (h, sizeof (h), async) + + call acc_wait (async) + + do i = 1, N + if (h(i) /= i + i) call abort + end do + + call acc_copyin (h, sizeof (h)) + + h(:) = 0 + + call acc_update_self_async (h, sizeof (h), async) + + if (acc_is_present (h) .neqv. .TRUE.) call abort + + do i = 1, N + if (h(i) /= i + i) call abort + end do + + call acc_delete_async (h, async) + + call acc_wait (async) + + if (acc_is_present (h) .neqv. .FALSE.) call abort + +end program Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (revision 0) @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ + +#include <string.h> +#include <stdlib.h> +#include <openacc.h> + +int +main (int argc, char **argv) +{ + const int N = 256; + int i; + int async = 8; + unsigned char *h; + + h = (unsigned char *) malloc (N); + + for (i = 0; i < N; i++) + { + h[i] = i; + } + + acc_copyin_async (h, N, async); + + memset (h, 0, N); + + acc_wait (async); + + acc_copyout_async (h, N, async + 1); + + acc_wait (async + 1); + + for (i = 0; i < N; i++) + { + if (h[i] != i) + abort (); + } + + free (h); + + return 0; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (revision 0) @@ -0,0 +1,45 @@ +/* { dg-do run } */ +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ + +#include <string.h> +#include <stdlib.h> +#include <openacc.h> + +int +main (int argc, char **argv) +{ + const int N = 256; + int i, q = 5; + unsigned char *h, *g; + void *d; + + h = (unsigned char *) malloc (N); + g = (unsigned char *) malloc (N); + for (i = 0; i < N; i++) + { + g[i] = i; + } + + acc_create_async (h, N, q); + + acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q); + memset (&h[0], 0, N); + + acc_wait (q); + + acc_update_self_async (h, N, q + 1); + acc_delete_async (h, N, q + 1); + + acc_wait (q + 1); + + for (i = 0; i < N; i++) + { + if (h[i] != i) + abort (); + } + + free (h); + free (g); + + return 0; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c (revision 0) @@ -0,0 +1,904 @@ +/* { dg-do run { target openacc_nvidia_accel_selected } } */ +/* { dg-additional-options "-lcuda" } */ + +#include <openacc.h> +#include <stdlib.h> +#include "cuda.h" + +#include <stdio.h> +#include <time.h> +#include <sys/time.h> + +int +main (int argc, char **argv) +{ + CUresult r; + CUstream stream1; + int N = 128; //1024 * 1024; + float *a, *b, *c, *d, *e; + int i; + int nbytes; + + srand (time (NULL)); + int s = rand () % 100; + + acc_init (acc_device_nvidia); + + nbytes = N * sizeof (float); + + a = (float *) malloc (nbytes); + b = (float *) malloc (nbytes); + c = (float *) malloc (nbytes); + d = (float *) malloc (nbytes); + e = (float *) malloc (nbytes); + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + } + + acc_set_default_async (s); + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 3.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 2.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc parallel wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 4.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 11.0) + abort (); + } + + + r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING); + if (r != CUDA_SUCCESS) + { + fprintf (stderr, "cuStreamCreate failed: %d\n", r); + abort (); + } + + acc_set_cuda_stream (1, stream1); + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 5.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 7.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 7.0) + abort (); + + if (b[i] != 49.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc parallel wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 17.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 4.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 4.0) + abort (); + + if (b[i] != 16.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) async + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 25.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 3.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 2.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc kernels wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 4.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 11.0) + abort (); + } + + + r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING); + if (r != CUDA_SUCCESS) + { + fprintf (stderr, "cuStreamCreate failed: %d\n", r); + abort (); + } + + acc_set_cuda_stream (1, stream1); + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 5.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 7.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 7.0) + abort (); + + if (b[i] != 49.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc kernels wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 17.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 4.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 4.0) + abort (); + + if (b[i] != 16.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) async + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 25.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + acc_shutdown (acc_device_nvidia); + + return 0; +} Index: include/gomp-constants.h =================================================================== --- include/gomp-constants.h (revision 245382) +++ include/gomp-constants.h (working copy) @@ -182,6 +182,7 @@ enum gomp_map_kind /* Asynchronous behavior. Keep in sync with libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_async_t. */ +#define GOMP_ASYNC_DEFAULT 0 #define GOMP_ASYNC_NOVAL -1 #define GOMP_ASYNC_SYNC -2