Re: Runtime checking of OpenACC parallelism dimensions clauses

Thomas Schwinge Fri, 19 May 2017 04:03:32 -0700

Hi!

Ping.


On Thu, 11 May 2017 14:24:05 +0200, I wrote:
> OK for trunk?
> 
> commit 0ba48b4faf85420fbe12971afdd6e0afe70778bb
> Author: Thomas Schwinge <tho...@codesourcery.com>
> Date:   Fri May 5 16:41:59 2017 +0200
> 
>     Runtime checking of OpenACC parallelism dimensions clauses
>     
>             libgomp/
>             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
>             * testsuite/lib/libgomp.exp
>             (check_effective_target_openacc_nvidia_accel_configured): New
>             proc.
>             * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
>             (check_effective_target_c++): New procs.
>             * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
>             (check_effective_target_c++): Likewise.
> ---
>  libgomp/testsuite/lib/libgomp.exp                  |  12 +
>  libgomp/testsuite/libgomp.oacc-c++/c++.exp         |   7 +
>  .../libgomp.oacc-c-c++-common/parallel-dims.c      | 523 
> ++++++++++++++++++++-
>  libgomp/testsuite/libgomp.oacc-c/c.exp             |   7 +
>  4 files changed, 537 insertions(+), 12 deletions(-)
> 
> diff --git libgomp/testsuite/lib/libgomp.exp libgomp/testsuite/lib/libgomp.exp
> index 5e47872..62ee2e3 100644
> --- libgomp/testsuite/lib/libgomp.exp
> +++ libgomp/testsuite/lib/libgomp.exp
> @@ -358,6 +358,18 @@ proc check_effective_target_offload_device_shared_as { } 
> {
>      } ]
>  }
>  
> +# Return 1 if configured for nvptx offloading.
> +
> +proc check_effective_target_openacc_nvidia_accel_configured { } {
> +    global offload_targets
> +    if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
> +        return 0
> +    }
> +    # PR libgomp/65099: Currently, we only support offloading in 64-bit
> +    # configurations.
> +    return [is-effective-target lp64]
> +}
> +
>  # Return 1 if at least one nvidia board is present.
>  
>  proc check_effective_target_openacc_nvidia_accel_present { } {
> diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp 
> libgomp/testsuite/libgomp.oacc-c++/c++.exp
> index 608b298..9beadd6 100644
> --- libgomp/testsuite/libgomp.oacc-c++/c++.exp
> +++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
> @@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
>  load_gcc_lib gcc-dg.exp
>  load_gcc_lib torture-options.exp
>  
> +proc check_effective_target_c { } {
> +    return 0
> +}
> +proc check_effective_target_c++ { } {
> +    return 1
> +}
> +
>  global shlib_ext
>  
>  set shlib_ext [get_shlib_extension]
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c 
> libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> index f5766a4..d8af546 100644
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> @@ -1,25 +1,524 @@
> -/* { dg-do run { target openacc_nvidia_accel_selected } } */
> +/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
> +   vector_length.  */
> +
> +#include <limits.h>
> +#include <openacc.h>
> +
> +/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
> +   not behaving as expected for -O0.  */
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> +{
> +  if (acc_on_device ((int) acc_device_host))
> +    return 0;
> +  else if (acc_on_device ((int) acc_device_nvidia))
> +    {
> +      unsigned int r;
> +      asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
> +      return r;
> +    }
> +  else
> +    __builtin_abort ();
> +}
> +
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> +{
> +  if (acc_on_device ((int) acc_device_host))
> +    return 0;
> +  else if (acc_on_device ((int) acc_device_nvidia))
> +    {
> +      unsigned int r;
> +      asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
> +      return r;
> +    }
> +  else
> +    __builtin_abort ();
> +}
> +
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
> +{
> +  if (acc_on_device ((int) acc_device_host))
> +    return 0;
> +  else if (acc_on_device ((int) acc_device_nvidia))
> +    {
> +      unsigned int r;
> +      asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
> +      return r;
> +    }
> +  else
> +    __builtin_abort ();
> +}
>  
> -/* Worker and vector size checks.  Picked an outrageously large
> -   value. */
>  
>  int main ()
>  {
> -  int dummy[10];
> +  acc_init (acc_device_default);
>  
> -#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } 
> */
> +  /* Non-positive value.  */
> +
> +  /* GR, WS, VS.  */
> +  {
> +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { 
> target c } } */
> +    int gangs_actual = GANGS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: 
> gangs_max, workers_max, vectors_max) \
> +  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" 
> { target c++ } } */
> +    {
> +      /* We're actually executing with num_gangs (1).  */
> +      gangs_actual = 1;
> +      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +     {
> +       /* <https://gcc.gnu.org/PR80547>.  */
> +#if 0
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +#else
> +       int gangs = acc_gang ();
> +       gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
> +       gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
> +       int workers = acc_worker ();
> +       workers_min = (workers_min < workers) ? workers_min : workers;
> +       workers_max = (workers_max > workers) ? workers_max : workers;
> +       int vectors = acc_vector ();
> +       vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
> +       vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
> +#endif
> +     }
> +    }
> +    if (gangs_actual != 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != gangs_actual - 1
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +#undef GANGS
> +  }
> +
> +  /* GP, WS, VS.  */
> +  {
> +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { 
> target c } } */
> +    int gangs_actual = GANGS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" 
> { target c++ } } */
> +    {
> +      /* We're actually executing with num_gangs (1).  */
> +      gangs_actual = 1;
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (gangs_actual != 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != gangs_actual - 1
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +#undef GANGS
> +  }
> +
> +  /* GR, WP, VS.  */
> +  {
> +#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" 
> { target c } } */
> +    int workers_actual = WORKERS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (workers_actual) \
> +  num_workers (WORKERS) /* { dg-warning "'num_workers' value must be 
> positive" "" { target c++ } } */
> +    {
> +      /* We're actually executing with num_workers (1).  */
> +      workers_actual = 1;
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (workers_actual != 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != 0
> +     || workers_min != 0 || workers_max != workers_actual - 1
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +#undef WORKERS
> +  }
> +
> +  /* GR, WS, VP.  */
> +  {
> +#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" 
> "" { target c } } */
> +    int vectors_actual = VECTORS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using 
> vector_length \\(32\\), ignoring 1" "" { target 
> openacc_nvidia_accel_configured } } */ \
> +  vector_length (VECTORS) /* { dg-warning "'vector_length' value must be 
> positive" "" { target c++ } } */
> +    {
> +      /* We're actually executing with vector_length (1), just the GCC nvptx
> +      back end enforces vector_length (32).  */
> +      if (acc_on_device (acc_device_nvidia))
> +     vectors_actual = 32;
> +      else
> +     vectors_actual = 1;
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (acc_get_device_type () == acc_device_nvidia)
> +      {
> +     if (vectors_actual != 32)
> +       __builtin_abort ();
> +      }
> +    else
> +      if (vectors_actual != 1)
> +     __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != 0
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != vectors_actual - 1)
> +      __builtin_abort ();
> +#undef VECTORS
> +  }
> +
> +
> +  /* High value.  */
> +  
> +  /* GR, WS, VS.  */
> +  {
> +    /* There is no actual limit for the number of gangs, so we try with a
> +       rather high value.  */
> +    int gangs = 12345;
> +    int gangs_actual = gangs;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: 
> gangs_max, workers_max, vectors_max) \
> +  num_gangs (gangs)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with num_gangs (1).  */
> +       gangs_actual = 1;
> +     }
> +      /* As we're executing GR not GP, don't multiply with a "gangs_actual"
> +      factor.  */
> +      for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; 
> --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (gangs_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != gangs_actual - 1
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +  }
> +
> +  /* GP, WS, VS.  */
> +  {
> +    /* There is no actual limit for the number of gangs, so we try with a
> +       rather high value.  */
> +    int gangs = 12345;
> +    int gangs_actual = gangs;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  num_gangs (gangs)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with num_gangs (1).  */
> +       gangs_actual = 1;
> +     }
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (gangs_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != gangs_actual - 1
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +  }
> +
> +  /* GR, WP, VS.  */
> +  {
> +    /* We try with an outrageously large value. */
> +#define WORKERS 2 << 20
> +    int workers_actual = WORKERS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (workers_actual) /* { dg-warning "using 
> num_workers \\(32\\), ignoring 2097152" "" { target 
> openacc_nvidia_accel_configured } } */ \
> +  num_workers (WORKERS)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with num_workers (1).  */
> +       workers_actual = 1;
> +     }
> +      else if (acc_on_device (acc_device_nvidia))
> +     {
> +       /* The GCC nvptx back end enforces num_workers (32).  */
> +       workers_actual = 32;
> +     }
> +      else
> +     __builtin_abort ();
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (workers_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != 0
> +     || workers_min != 0 || workers_max != workers_actual - 1
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +#undef WORKERS
> +  }
> +
> +  /* GR, WP, VS.  */
> +  {
> +    /* We try with an outrageously large value. */
> +    int workers = 2 << 20;
> +    /* For nvptx offloading, this one will not result in "using num_workers
> +       (32), ignoring runtime setting", and will in fact try to launch with
> +       "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
> +       error: invalid argument".  So, limit ourselves here.  */
> +    if (acc_get_device_type () == acc_device_nvidia)
> +      workers = 32;
> +    int workers_actual = workers;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (workers_actual) \
> +  num_workers (workers)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with num_workers (1).  */
> +       workers_actual = 1;
> +     }
> +      else if (acc_on_device (acc_device_nvidia))
> +     {
> +       /* We're actually executing with num_workers (32).  */
> +       /* workers_actual = 32; */
> +     }
> +      else
> +     __builtin_abort ();
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (workers_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != 0
> +     || workers_min != 0 || workers_max != workers_actual - 1
> +     || vectors_min != 0 || vectors_max != 0)
> +      __builtin_abort ();
> +  }
> +
> +  /* GR, WS, VP.  */
>    {
> -#pragma acc loop worker
> -    for (int  i = 0; i < 10; i++)
> -      dummy[i] = i;
> +    /* We try with an outrageously large value. */
> +#define VECTORS 2 << 20
> +    int vectors_actual = VECTORS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using 
> vector_length \\(32\\), ignoring 2097152" "" { target 
> openacc_nvidia_accel_configured } } */ \
> +  vector_length (VECTORS)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with vector_length (1).  */
> +       vectors_actual = 1;
> +     }
> +      else if (acc_on_device (acc_device_nvidia))
> +     {
> +       /* The GCC nvptx back end enforces vector_length (32).  */
> +       vectors_actual = 32;
> +     }
> +      else
> +     __builtin_abort ();
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (vectors_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != 0
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != vectors_actual - 1)
> +      __builtin_abort ();
> +#undef VECTORS
>    }
>  
> -#pragma acc parallel vector_length (2<<20) /* { dg-error "using 
> vector_length" } */
> +  /* GR, WS, VP.  */
>    {
> -#pragma acc loop vector
> -    for (int  i = 0; i < 10; i++)
> -      dummy[i] = i;
> +    /* We try with an outrageously large value. */
> +    int vectors = 2 << 20;
> +    int vectors_actual = vectors;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using 
> vector_length \\(32\\), ignoring runtime setting" "" { target 
> openacc_nvidia_accel_configured } } */ \
> +  vector_length (vectors)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with vector_length (1).  */
> +       vectors_actual = 1;
> +     }
> +      else if (acc_on_device (acc_device_nvidia))
> +     {
> +       /* The GCC nvptx back end enforces vector_length (32).  */
> +       vectors_actual = 32;
> +     }
> +      else
> +     __builtin_abort ();
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (vectors_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min != 0 || gangs_max != 0
> +     || workers_min != 0 || workers_max != 0
> +     || vectors_min != 0 || vectors_max != vectors_actual - 1)
> +      __builtin_abort ();
>    }
>  
> +
> +  /* Composition of GP, WP, VP.  */
> +  {
> +    int gangs = 12345;
> +    /* With nvptx offloading, multi-level reductions apparently are very slow
> +       in the following case.  So, limit ourselves here.  */
> +    if (acc_get_device_type () == acc_device_nvidia)
> +      gangs = 3;
> +    int gangs_actual = gangs;
> +#define WORKERS 3
> +    int workers_actual = WORKERS;
> +#define VECTORS 11
> +    int vectors_actual = VECTORS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* 
> { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target 
> openacc_nvidia_accel_configured } } */ \
> +  num_gangs (gangs) \
> +  num_workers (WORKERS) \
> +  vector_length (VECTORS)
> +    {
> +      if (acc_on_device (acc_device_host))
> +     {
> +       /* We're actually executing with num_gangs (1), num_workers (1),
> +          vector_length (1).  */
> +       gangs_actual = 1;
> +       workers_actual = 1;
> +       vectors_actual = 1;
> +     }
> +      else if (acc_on_device (acc_device_nvidia))
> +     {
> +       /* The GCC nvptx back end enforces vector_length (32).  */
> +       vectors_actual = 32;
> +     }
> +      else
> +     __builtin_abort ();
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +     for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +       for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
> +         {
> +           gangs_min = gangs_max = acc_gang ();
> +           workers_min = workers_max = acc_worker ();
> +           vectors_min = vectors_max = acc_vector ();
> +         }
> +    }
> +    if (gangs_min != 0 || gangs_max != gangs_actual - 1
> +     || workers_min != 0 || workers_max != workers_actual - 1
> +     || vectors_min != 0 || vectors_max != vectors_actual - 1)
> +      __builtin_abort ();
> +#undef VECTORS
> +#undef WORKERS
> +  }
> +
> +
> +  /* We can't test parallelized OpenACC kernels constructs in this way: use 
> of
> +     the acc_gang, acc_worker, acc_vector functions will make the construct
> +     unparallelizable.  */
> +
> +
> +  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
> +     kernels.  */
> +  {
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
> +    gangs_min = workers_min = vectors_min = INT_MAX;
> +    gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc kernels
> +    {
> +      /* This is to make the OpenACC kernels construct unparallelizable.  */
> +      asm volatile ("" : : : "memory");
> +
> +#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) 
> reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i = 100; i > -100; --i)
> +     {
> +       gangs_min = gangs_max = acc_gang ();
> +       workers_min = workers_max = acc_worker ();
> +       vectors_min = vectors_max = acc_vector ();
> +     }
> +    }
> +    if (gangs_min != 0 || gangs_max != 1 - 1
> +     || workers_min != 0 || workers_max != 1 - 1
> +     || vectors_min != 0 || vectors_max != 1 - 1)
> +      __builtin_abort ();
> +  }
> +
> +
>    return 0;
>  }
> diff --git libgomp/testsuite/libgomp.oacc-c/c.exp 
> libgomp/testsuite/libgomp.oacc-c/c.exp
> index b509a10..4475bf5 100644
> --- libgomp/testsuite/libgomp.oacc-c/c.exp
> +++ libgomp/testsuite/libgomp.oacc-c/c.exp
> @@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
>  load_gcc_lib gcc-dg.exp
>  load_gcc_lib torture-options.exp
>  
> +proc check_effective_target_c { } {
> +    return 1
> +}
> +proc check_effective_target_c++ { } {
> +    return 0
> +}
> +
>  # Initialize dg.
>  dg-init
>  torture-init


Grüße
 Thomas

Re: Runtime checking of OpenACC parallelism dimensions clauses

Reply via email to