[Bug-apl] OpenMP patch, test harness and benchmark

David B. Lamkins Sat, 15 Mar 2014 12:23:36 -0700

See attached files.

With the build configured using ASSERT_LEVEL_WANTED=0, the execution time
ratio (without OpenMP : with OpenMP) improved slightly to about 2:1.


Note that this test uses a wider range of ravel lengths. The execution
time ratio is pretty much constant, falling off only for the smallest
(100 element) ravels.

I did monitor the system this time. All 8 cores were busy at close to
100% utilization. Memory utilization remained below the swap threshold
during the entire test.

Index: configure.ac
===================================================================
--- configure.ac        (revision 164)
+++ configure.ac        (working copy)
@@ -89,6 +89,20 @@
            -lncurses)
        ])
 
+# enable OpenMP support upon request: --with-openmp sets OPENMP=-fopenmp,
+# otherwise OPENMP stays empty. POSIX test(1) compares with '=', not '=='.
+AC_ARG_WITH( [openmp],
+             [AS_HELP_STRING([--with-openmp],
+             [enable OpenMP (experimental)])],
+             [],
+             [with_openmp=no])
+
+OPENMP=
+    AS_IF(
+        [test "x$with_openmp" = xyes],
+        [AC_SUBST([OPENMP], ["-fopenmp"])]
+        )
+
 AC_CONFIG_FILES([Makefile
                  debian/Makefile
                  debian/source/Makefile
Index: src/SkalarFunction.cc
===================================================================
--- src/SkalarFunction.cc       (revision 164)
+++ src/SkalarFunction.cc       (working copy)
@@ -64,6 +64,7 @@
    if (count == 0)   return eval_fill_B(B);
 
 Value_P Z(new Value(B->get_shape(), LOC));
+#pragma omp parallel for
    loop(c, count)
        {
          const Cell * cell_B =  &B->get_ravel(c);
@@ -138,6 +139,7 @@
         const Cell * cell_A = &A->get_ravel(0);
         Value_P Z(new Value(B->get_shape(), LOC));
 
+#pragma omp parallel for
         loop(c, count)
             {
               const Cell * cell_B = &B->get_ravel(c);
@@ -158,6 +160,7 @@
         const Cell * cell_B = &B->get_ravel(0);
         Value_P Z(new Value(A->get_shape(), LOC));
 
+#pragma omp parallel for
         loop(c, count)
             {
               const Cell * cell_A = &A->get_ravel(c);
@@ -181,6 +184,7 @@
 
 Value_P Z(new Value(A->get_shape(), LOC));
 
+#pragma omp parallel for
    loop(c, count)
        {
          const Cell * cell_A = &A->get_ravel(c);
Index: src/SystemLimits.def
===================================================================
--- src/SystemLimits.def        (revision 164)
+++ src/SystemLimits.def        (working copy)
@@ -24,6 +24,14 @@
 syl1("smallest integer"                       , SMALL_INT         , 
-9223372036854770000LL)
 syl1("largest numeric exponent"               , LARGEST_EXPO      , 308        
         )
 syl1("max. shared variable size (bytes)"      , MAX_SVAR_SIZE     , 65000      
         )
+#ifdef _OPENMP
+syl2("OpenMP dynamic adjustment"              , OMP_DYNAMIC       , 
omp_get_dynamic()   )
+syl2("OpenMP nested parallelism"              , OMP_NESTED        , 
omp_get_nested()    )
+syl2("OpenMP thread limit"                    , OMP_LIMIT_THREADS , 
omp_get_thread_limit())
+syl2("OpenMP max active levels"               , OMP_MAX_LEVELS    , 
omp_get_max_active_levels())
+syl2("OpenMP max threads per region"          , OMP_MAX_THREADS   , 
omp_get_max_threads())
+syl2("OpenMP processor count"                 , OMP_NUM_PROCS     , 
omp_get_num_procs() )
+#endif
 
 #undef syl1
 #undef syl2
Index: src/SystemLimits.hh
===================================================================
--- src/SystemLimits.hh (revision 164)
+++ src/SystemLimits.hh (working copy)
@@ -23,6 +23,10 @@
 
 ///  System limits and default values defined for this interpreter.
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
 enum
 {
 #define syl1(_n, e, v) e = v,
Index: src/SystemVariable.cc
===================================================================
--- src/SystemVariable.cc       (revision 164)
+++ src/SystemVariable.cc       (working copy)
@@ -45,6 +45,10 @@
 #include "Value.hh"
 #include "Workspace.hh"
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
 UCS_string Quad_QUOTE::prompt;
 
 ShapeItem Quad_SYL::si_depth_limit = 0;
@@ -929,6 +933,23 @@
 
              ravel_count_limit = cells;
            }
+#ifdef _OPENMP
+        else if (x == SYL_OMP_DYNAMIC)   // OpenMP dynamic adjustment
+          {
+            if (b < 0 || b > 1)   DOMAIN_ERROR;
+            omp_set_dynamic((int)b);
+          }
+        else if (x == SYL_OMP_NESTED)   // OpenMP nested parallelism
+          {
+            if (b < 0 || b > 1)   DOMAIN_ERROR;
+            omp_set_nested((int)b);
+          }
+        else if (x == SYL_OMP_MAX_LEVELS)   // OpenMP max active levels
+          {
+            if (b < 0)   DOMAIN_ERROR;
+            omp_set_max_active_levels((int)b);
+          }
+#endif
         else
            {
              INDEX_ERROR;
Index: src/Value.cc
===================================================================
--- src/Value.cc        (revision 164)
+++ src/Value.cc        (working copy)
@@ -1267,6 +1267,7 @@
                  ++error_count;
       }
 
+#pragma omp parallel for
     loop(c, element_count())
        {
          const Cell * cell = &get_ravel(c);
Index: src/main.cc
===================================================================
--- src/main.cc (revision 164)
+++ src/main.cc (working copy)
@@ -508,6 +508,9 @@
 
   "",
   c1,
+#ifdef _OPENMP
+  "Compiled with OpenMP support (experimental)",
+#endif
   "",
   "Copyright (C) 2008-2014  Dr. Jürgen Sauermann",
   "Banner by FIGlet: www.figlet.org",

# Performance test, vim: syntax=apl

∇ MAKE_FUT EXPR;F;TXT
⍝
⍝ ⎕FX a function FUT that executes EXPR CNT times and returns
⍝ the number of milliseconds spent for doing so, and CNT.
⍝
⍝ EXPR: character vector with the APL expression to be timed. If EXPR is
⍝       empty, the five statement lines in the loop body are empty, so FUT
⍝       then measures only the loop overhead.
⍝
⍝ The text of FUT is collected line by line in TXT. FUT rounds CNT up to a
⍝ multiple of 5 because the loop body holds 5 copies of 'Z←',EXPR and N
⍝ advances by 5 per pass; the CNT that FUT returns is the rounded value.
⍝
  TXT←     ⊂ 'MS_CNT←FUT CNT;N;Z;MS'
  TXT←TXT, ⊂ 'N←0 ◊ CNT←5×⌈CNT÷5 ◊ MS←⎕TS'
  TXT←TXT, ⊂ 'LOOP:'
  TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
  TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
  TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
  TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
  TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
  TXT←TXT, ⊂ '→(CNT>N←N+5)⍴LOOP'
⍝ The elapsed time is the mixed-radix decode of the ⎕TS difference using
⍝ 12 months, 30 days, 24 hours, 60 minutes, 60 seconds and 1000 ms; every
⍝ month is therefore counted as 30 days.
  TXT←TXT, ⊂ 'MS←1 12 30 24 60 60 1000⊥⎕TS-MS'
  TXT←TXT, ⊂ 'MS_CNT←MS,CNT'
⍝ F receives the result of ⎕FX; it is local and not examined afterwards.
  F←⎕FX TXT
∇

∇ MS_CNT←RUN_PERF EXPR;MS;CNT;N1;N2
⍝
⍝ Measure EXPR and return the 2-element result (MS CNT) of the final
⍝ FUT run: the total time in ms and the number of executions of EXPR.
⍝
⍝ TPP is not an argument or local of this function; it is read from the
⍝ calling environment (GO supplies it as its left argument). It is the
⍝ target duration of the final run, in the same unit as MS (milliseconds).
⍝
⍝ 1. create FUT that runs and measures EXPR N times.
⍝
  MAKE_FUT EXPR
⍝
⍝ 2. compute N1 so that FUT N1 takes at least 100 ms: N1 is tripled and
⍝    the line branches back to itself (⎕LC) as long as MS < 100.
⍝
  N1←1
  N1←N1×3 ◊ (MS CNT)←FUT N1 ◊ →(MS < 100)⍴⎕LC
⍝
⍝ 3. compute N2 so that FUT N2 takes about TPP milliseconds, by scaling
⍝    the calibration run's rate (CNT executions in MS ms)
⍝
  N2←⌈TPP×CNT÷MS
⍝
⍝ 4. execute FUT N2 and print a one-line progress report
⍝
  MS_CNT←(MS CNT)←FUT N2
  'Running  ''' (30↑EXPR) ''' ' (8 0⍕CNT) ' times: ' MS 'ms'
∇

∇T←T0 DELTA EXPR;T2
⍝
⍝ Measure EXPR with RUN_PERF and append one row to the table TOTAL.
⍝
⍝ T0:   per-execution baseline time in ms that is subtracted from the
⍝       measured per-execution time (GO passes the empty-loop time).
⍝ EXPR: character vector with the expression to be measured.
⍝ T:    per-execution time of EXPR in ms after subtracting T0, but not
⍝       less than 0.
⍝
⍝ TOTAL is not local here; the row is appended to the TOTAL of the
⍝ calling environment (GO's result variable).
⍝ NOTE(review): MS, CNT and UNIT are assigned below but are not in the
⍝ local list of the header, so they are set outside this function too.
⍝
 (MS CNT)←RUN_PERF EXPR ◊ T←MS÷CNT
 T←T2←0⌈T-T0 ◊ UNIT←' ms'
⍝ Each of the next two lines branches to the line after itself when T2≥1;
⍝ otherwise it scales T2 by 1000 and switches to the next smaller unit.
 →1+(T2≥1)⍴⎕LC ◊ T2←T2×1000 ◊  UNIT←' μs'
 →1+(T2≥1)⍴⎕LC ◊ T2←T2×1000 ◊  UNIT←' ns'
⍝ New row: formatted time with unit, the expression, the execution count.
 TOTAL←TOTAL⍪((8 2⍕T2),UNIT) EXPR (8 0⍕CNT)
∇

∇TOTAL←TPP GO EXPRS;I_1000_1000;I_1000000;R_1000000; T0;T1
⍝
⍝ Run the benchmark for every expression in EXPRS and return the result
⍝ table TOTAL (columns: time, operation, execution count).
⍝
⍝ TPP:   target duration of each measurement run; RUN_PERF reads it from
⍝        this function's scope and treats it as milliseconds.
⍝ EXPRS: list of character vectors, each an expression that refers to the
⍝        I_... / R_... variables created below.
⍝
⍝ NOTE(review): the local list in the header looks stale. I_1000_1000 and
⍝ T1 are never referenced in the body, and of the variables assigned below
⍝ only I_1000000 and R_1000000 are localized; the others (including the
⍝ 100000000-element vectors) are not local to this function.
⍝
⍝ Arguments for benchmarked functions: for each length a vector I_<length>
⍝ of random integers (roll of 10) and a vector R_<length> with 0.1 added
⍝ to every element of I_<length>.
⍝
 I_100000000   ←?  100000000⍴10
 R_100000000   ← I_100000000+.1
 I_1000000     ←?  1000000⍴10
 R_1000000     ← I_1000000+.1
 I_10000       ←?  10000⍴10
 R_10000       ← I_10000+.1
 I_100         ←?  100⍴10
 R_100         ← I_100+.1

⍝ Table header; DELTA appends one row per measurement.
 TOTAL←1 3⍴'    Time   ' 'Operation' '     CNT'
 TOTAL←TOTAL,[1] '-----------' '---------' '--------'

⍝ T0 is the per-execution time of the empty expression (loop overhead,
⍝ measured against a baseline of 0.0); it is then subtracted from every
⍝ expression's time. 0⍴ reduces the vector of per-expression results to
⍝ an empty vector (presumably so that the times are not displayed here).
 T0←0.0 DELTA ''
  0⍴T0 DELTA¨EXPRS
 TOTAL←TOTAL,[1] '-----------' '---------' '--------'
∇

⍝ Expressions to benchmark: power (⋆) with integer and with real right
⍝ arguments, for ravel lengths 100000000, 1000000, 10000 and 100.
EXPRS←      'I_100000000⋆I_100000000' 'I_100000000⋆R_100000000' 
EXPRS←EXPRS,'I_1000000⋆I_1000000' 'I_1000000⋆R_1000000' 
EXPRS←EXPRS,'I_10000⋆I_10000' 'I_10000⋆R_10000' 
EXPRS←EXPRS,'I_100⋆I_100' 'I_100⋆R_100' 

⍝ Run every expression with TPP←20000 (about 20000 ms per measurement).
T←20000 GO EXPRS

⍝ Display the result table.
T

⍝ Leave the interpreter.
)off

$ ./configure ASSERT_LEVEL_WANTED=0
[output elided]
$ make clean all
[output elided]
$ src/apl -f src/testcases/ParallelPerf.apl 
                                       
                    ______ _   __ __  __    ___     ____   __ 
                   / ____// | / // / / /   /   |   / __ \ / / 
                  / / __ /  |/ // / / /   / /| |  / /_/ // /  
                 / /_/ // /|  // /_/ /   / ___ | / ____// /___
                 \____//_/ |_/ \____/   /_/  |_|/_/    /_____/
                                       
                     Welcome to GNU APL version 1.2 / 5829
                                       
                Copyright (C) 2008-2014  Dr. Jürgen Sauermann
                       Banner by FIGlet: www.figlet.org
                                       
                This program comes with ABSOLUTELY NO WARRANTY;
                        for details run: src/apl --gpl.
                                       
     This program is free software, and you are welcome to redistribute it
         according to the GNU Public License (GPL) version 3 or later.
                                       

      
      ∇ MAKE_FUT EXPR;F;TXT
      
      ∇ MS_CNT←RUN_PERF EXPR;MS;CNT;N1;N2
      
      ∇T←T0 DELTA EXPR;T2
      
      ∇TOTAL←TPP GO EXPRS;I_1000_1000;I_1000000;R_1000000; T0;T1
      
      EXPRS←      'I_100000000⋆I_100000000' 'I_100000000⋆R_100000000' 
      EXPRS←EXPRS,'I_1000000⋆I_1000000' 'I_1000000⋆R_1000000' 
      EXPRS←EXPRS,'I_10000⋆I_10000' 'I_10000⋆R_10000' 
      EXPRS←EXPRS,'I_100⋆I_100' 'I_100⋆R_100' 
      
      T←20000 GO EXPRS
 Running  '                                '  76466910  times:  19227 ms 
 Running  ' I_100000000⋆I_100000000        '         5  times:  48528 ms 
 Running  ' I_100000000⋆R_100000000        '         5  times:  57459 ms 
 Running  ' I_1000000⋆I_1000000            '       215  times:  19465 ms 
 Running  ' I_1000000⋆R_1000000            '       205  times:  19436 ms 
 Running  ' I_10000⋆I_10000                '     23225  times:  20190 ms 
 Running  ' I_10000⋆R_10000                '     22795  times:  20297 ms 
 Running  ' I_100⋆I_100                    '   2039900  times:  20361 ms 
 Running  ' I_100⋆R_100                    '   1978395  times:  19998 ms 
      
      T
     Time    Operation                    CNT 
 ----------- ---------               -------- 
   251.44 ns                         76466910 
  9705.60 ms I_100000000⋆I_100000000        5 
 11491.80 ms I_100000000⋆R_100000000        5 
    90.53 ms I_1000000⋆I_1000000          215 
    94.81 ms I_1000000⋆R_1000000          205 
   869.07 μs I_10000⋆I_10000            23225 
   890.16 μs I_10000⋆R_10000            22795 
     9.73 μs I_100⋆I_100              2039900 
     9.86 μs I_100⋆R_100              1978395 
 ----------- ---------               -------- 
      
      )off

Goodbye.
$ ./configure ASSERT_LEVEL_WANTED=0 --with-openmp
[output elided]
$ make clean all
[output elided]
$ src/apl -f src/testcases/ParallelPerf.apl 
                                       
                    ______ _   __ __  __    ___     ____   __ 
                   / ____// | / // / / /   /   |   / __ \ / / 
                  / / __ /  |/ // / / /   / /| |  / /_/ // /  
                 / /_/ // /|  // /_/ /   / ___ | / ____// /___
                 \____//_/ |_/ \____/   /_/  |_|/_/    /_____/
                                       
                     Welcome to GNU APL version 1.2 / 5829
                  Compiled with OpenMP support (experimental)
                                       
                Copyright (C) 2008-2014  Dr. Jürgen Sauermann
                       Banner by FIGlet: www.figlet.org
                                       
                This program comes with ABSOLUTELY NO WARRANTY;
                        for details run: src/apl --gpl.
                                       
     This program is free software, and you are welcome to redistribute it
         according to the GNU Public License (GPL) version 3 or later.
                                       

      
      ∇ MAKE_FUT EXPR;F;TXT
      
      ∇ MS_CNT←RUN_PERF EXPR;MS;CNT;N1;N2
      
      ∇T←T0 DELTA EXPR;T2
      
      ∇TOTAL←TPP GO EXPRS;I_1000_1000;I_1000000;R_1000000; T0;T1
      
      EXPRS←      'I_100000000⋆I_100000000' 'I_100000000⋆R_100000000' 
      EXPRS←EXPRS,'I_1000000⋆I_1000000' 'I_1000000⋆R_1000000' 
      EXPRS←EXPRS,'I_10000⋆I_10000' 'I_10000⋆R_10000' 
      EXPRS←EXPRS,'I_100⋆I_100' 'I_100⋆R_100' 
      
      T←20000 GO EXPRS
 Running  '                                '  15959460  times:  20800 ms 
 Running  ' I_100000000⋆I_100000000        '         5  times:  29432 ms 
 Running  ' I_100000000⋆R_100000000        '         5  times:  28931 ms 
 Running  ' I_1000000⋆I_1000000            '       525  times:  20758 ms 
 Running  ' I_1000000⋆R_1000000            '       520  times:  20913 ms 
 Running  ' I_10000⋆I_10000                '     46230  times:  21508 ms 
 Running  ' I_10000⋆R_10000                '     46670  times:  21867 ms 
 Running  ' I_100⋆I_100                    '   2199445  times:  22004 ms 
 Running  ' I_100⋆R_100                    '   2175140  times:  21338 ms 
      
      T
     Time    Operation                    CNT 
 ----------- ---------               -------- 
     1.30 μs                         15959460 
  5886.40 ms I_100000000⋆I_100000000        5 
  5786.20 ms I_100000000⋆R_100000000        5 
    39.54 ms I_1000000⋆I_1000000          525 
    40.22 ms I_1000000⋆R_1000000          520 
   463.94 μs I_10000⋆I_10000            46230 
   467.24 μs I_10000⋆R_10000            46670 
     8.70 μs I_100⋆I_100              2199445 
     8.51 μs I_100⋆R_100              2175140 
 ----------- ---------               -------- 
      
      )off

Goodbye.

[Bug-apl] OpenMP patch, test harness and benchmark

Reply via email to