See attached files.
With the build configured using ASSERT_LEVEL_WANTED=0, the execution time
ratio (without OpenMP : with OpenMP) improved slightly, to about 2:1.
Note that this test uses a wider range of ravel lengths. The execution
time ratio is pretty much constant, falling off only for the smallest
(100 element) ravels.
I did monitor the system this time. All 8 cores were busy at close to
100% utilization. Memory utilization remained below the swap threshold
during the entire test.
Index: configure.ac
===================================================================
--- configure.ac (revision 164)
+++ configure.ac (working copy)
@@ -89,6 +89,20 @@
-lncurses)
])
+# enable OpenMP support upon request (--with-openmp); disabled by default.
+# POSIX test(1) only defines "=" for string equality, so "==" is avoided.
+AC_ARG_WITH( [openmp],
+ [AS_HELP_STRING([--with-openmp],
+ [enable OpenMP (experimental)])],
+ [],
+ [with_openmp=no])
+
+OPENMP=
+ AS_IF(
+ [test "x$with_openmp" = xyes],
+ [AC_SUBST([OPENMP], ["-fopenmp"])]
+ )
+
AC_CONFIG_FILES([Makefile
debian/Makefile
debian/source/Makefile
Index: src/SkalarFunction.cc
===================================================================
--- src/SkalarFunction.cc (revision 164)
+++ src/SkalarFunction.cc (working copy)
@@ -64,6 +64,7 @@
if (count == 0) return eval_fill_B(B);
Value_P Z(new Value(B->get_shape(), LOC));
+#pragma omp parallel for
loop(c, count)
{
const Cell * cell_B = &B->get_ravel(c);
@@ -138,6 +139,7 @@
const Cell * cell_A = &A->get_ravel(0);
Value_P Z(new Value(B->get_shape(), LOC));
+#pragma omp parallel for
loop(c, count)
{
const Cell * cell_B = &B->get_ravel(c);
@@ -158,6 +160,7 @@
const Cell * cell_B = &B->get_ravel(0);
Value_P Z(new Value(A->get_shape(), LOC));
+#pragma omp parallel for
loop(c, count)
{
const Cell * cell_A = &A->get_ravel(c);
@@ -181,6 +184,7 @@
Value_P Z(new Value(A->get_shape(), LOC));
+#pragma omp parallel for
loop(c, count)
{
const Cell * cell_A = &A->get_ravel(c);
Index: src/SystemLimits.def
===================================================================
--- src/SystemLimits.def (revision 164)
+++ src/SystemLimits.def (working copy)
@@ -24,6 +24,14 @@
syl1("smallest integer" , SMALL_INT ,
-9223372036854770000LL)
syl1("largest numeric exponent" , LARGEST_EXPO , 308
)
syl1("max. shared variable size (bytes)" , MAX_SVAR_SIZE , 65000
)
+#ifdef _OPENMP
+syl2("OpenMP dynamic adjustment" , OMP_DYNAMIC ,
omp_get_dynamic() )
+syl2("OpenMP nested parallelism" , OMP_NESTED ,
omp_get_nested() )
+syl2("OpenMP thread limit" , OMP_LIMIT_THREADS ,
omp_get_thread_limit())
+syl2("OpenMP max active levels" , OMP_MAX_LEVELS ,
omp_get_max_active_levels())
+syl2("OpenMP max threads per region" , OMP_MAX_THREADS ,
omp_get_max_threads())
+syl2("OpenMP processor count" , OMP_NUM_PROCS ,
omp_get_num_procs() )
+#endif
#undef syl1
#undef syl2
Index: src/SystemLimits.hh
===================================================================
--- src/SystemLimits.hh (revision 164)
+++ src/SystemLimits.hh (working copy)
@@ -23,6 +23,10 @@
/// System limits and default values defined for this interpreter.
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
enum
{
#define syl1(_n, e, v) e = v,
Index: src/SystemVariable.cc
===================================================================
--- src/SystemVariable.cc (revision 164)
+++ src/SystemVariable.cc (working copy)
@@ -45,6 +45,10 @@
#include "Value.hh"
#include "Workspace.hh"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
UCS_string Quad_QUOTE::prompt;
ShapeItem Quad_SYL::si_depth_limit = 0;
@@ -929,6 +933,23 @@
ravel_count_limit = cells;
}
+#ifdef _OPENMP
+ else if (x == SYL_OMP_DYNAMIC) // OpenMP dynamic adjustment
+ {
+ if (b < 0 || b > 1) DOMAIN_ERROR;
+ omp_set_dynamic((int)b);
+ }
+ else if (x == SYL_OMP_NESTED) // OpenMP nested parallelism
+ {
+ if (b < 0 || b > 1) DOMAIN_ERROR;
+ omp_set_nested((int)b);
+ }
+ else if (x == SYL_OMP_MAX_LEVELS) // OpenMP max active levels
+ {
+ if (b < 0) DOMAIN_ERROR;
+ omp_set_max_active_levels((int)b);
+ }
+#endif
else
{
INDEX_ERROR;
Index: src/Value.cc
===================================================================
--- src/Value.cc (revision 164)
+++ src/Value.cc (working copy)
@@ -1267,6 +1267,7 @@
++error_count;
}
+#pragma omp parallel for
loop(c, element_count())
{
const Cell * cell = &get_ravel(c);
Index: src/main.cc
===================================================================
--- src/main.cc (revision 164)
+++ src/main.cc (working copy)
@@ -508,6 +508,9 @@
"",
c1,
+#ifdef _OPENMP
+ "Compiled with OpenMP support (experimental)",
+#endif
"",
"Copyright (C) 2008-2014 Dr. Jürgen Sauermann",
"Banner by FIGlet: www.figlet.org",
# Performance test, vim: syntax=apl
∇ MAKE_FUT EXPR;F;TXT
⍝
⍝ Define (via ⎕FX) a timing function FUT that executes EXPR CNT times
⍝ and returns the number of milliseconds spent doing so, followed by
⍝ the (rounded-up) CNT that was actually used.
⍝
⍝ EXPR: character vector holding the APL expression to benchmark. If it
⍝       is empty, the five body lines of FUT are empty as well, so FUT
⍝       then times the bare loop.
⍝
⍝ Header of FUT; N, Z and MS are local to FUT.
TXT← ⊂ 'MS_CNT←FUT CNT;N;Z;MS'
⍝ CNT is rounded up to a multiple of 5 because the loop body below holds
⍝ 5 copies of EXPR. MS first holds the start timestamp.
TXT←TXT, ⊂ 'N←0 ◊ CNT←5×⌈CNT÷5 ◊ MS←⎕TS'
TXT←TXT, ⊂ 'LOOP:'
⍝ Loop body: EXPR 5 times per pass (each line is empty if EXPR is empty).
TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
TXT←TXT, ⊂ (0<⍴,EXPR)/'Z←',EXPR
⍝ N advances by 5 per pass; loop again while CNT>N.
TXT←TXT, ⊂ '→(CNT>N←N+5)⍴LOOP'
⍝ Convert the ⎕TS difference to milliseconds with a mixed-radix decode;
⍝ the radix treats every month as 30 days.
TXT←TXT, ⊂ 'MS←1 12 30 24 60 60 1000⊥⎕TS-MS'
TXT←TXT, ⊂ 'MS_CNT←MS,CNT'
⍝ The result of ⎕FX is stored in the local F and not used further.
F←⎕FX TXT
∇
∇ MS_CNT←RUN_PERF EXPR;MS;CNT;N1;N2
⍝
⍝ Benchmark EXPR: return the total time (in ms) spent executing EXPR,
⍝ followed by the number of times it was executed.
⍝
⍝ NOTE: TPP (the target run time in ms) is neither an argument nor a
⍝ local of this function; it has to be set by a caller (the left
⍝ argument of GO is named TPP).
⍝
⍝ 1. create FUT that runs and measures EXPR CNT times.
⍝
MAKE_FUT EXPR
⍝
⍝ 2. calibrate: triple N1 (3, 9, 27, ...) until one FUT N1 run takes
⍝    at least 100 ms. →(...)⍴⎕LC re-executes the same line as long
⍝    as MS < 100.
⍝
N1←1
N1←N1×3 ◊ (MS CNT)←FUT N1 ◊ →(MS < 100)⍴⎕LC
⍝
⍝ 3. compute N2 so that FUT N2 takes about TPP milliseconds (scaled
⍝    from the calibration run, in which CNT executions took MS ms)
⍝
N2←⌈TPP×CNT÷MS
⍝
⍝ 4. execute FUT N2 and print a one-line progress report
⍝
MS_CNT←(MS CNT)←FUT N2
'Running ''' (30↑EXPR) ''' ' (8 0⍕CNT) ' times: ' MS 'ms'
∇
∇T←T0 DELTA EXPR;T2;MS;CNT;UNIT
⍝
⍝ Measure EXPR with RUN_PERF and append one result row to TOTAL.
⍝
⍝ T0:   baseline time per execution (ms) that is subtracted from the
⍝       measured time (0 when the baseline itself is being measured)
⍝ EXPR: character vector holding the expression to benchmark
⍝ T:    net time per execution of EXPR in ms, never below 0
⍝
⍝ TOTAL is not local here; the row is catenated to the caller's TOTAL.
⍝ MS, CNT and UNIT are localized in the header so that they do not
⍝ leak into the caller's (or the global) name scope.
⍝
(MS CNT)←RUN_PERF EXPR ◊ T←MS÷CNT
T←T2←0⌈T-T0 ◊ UNIT←' ms'
⍝ Pick the display unit: while T2 < 1, scale by 1000 (ms → μs → ns).
⍝ If T2≥1 the branch →1+⎕LC skips the rest of its line. Keep the next
⍝ two lines adjacent: each branch targets the line directly after it.
→1+(T2≥1)⍴⎕LC ◊ T2←T2×1000 ◊ UNIT←' μs'
→1+(T2≥1)⍴⎕LC ◊ T2←T2×1000 ◊ UNIT←' ns'
TOTAL←TOTAL⍪((8 2⍕T2),UNIT) EXPR (8 0⍕CNT)
∇
∇TOTAL←TPP GO EXPRS;I_100000000;R_100000000;I_1000000;R_1000000;I_10000;R_10000;I_100;R_100;T0
⍝
⍝ Benchmark every expression in EXPRS and return the result table.
⍝
⍝ TPP:   target run time per expression in ms (read by RUN_PERF, which
⍝        is called below via DELTA)
⍝ EXPRS: vector of character vectors, one APL expression each; the
⍝        expressions may refer to the I_... / R_... arrays created below
⍝ TOTAL: 3-column table (time per execution, expression, count), framed
⍝        by a title row and separator rows
⍝
⍝ Every benchmark argument assigned below is local to GO, so that none
⍝ of the arrays survives as a global after GO returns. The benchmarked
⍝ expressions still see them because they are executed while GO is
⍝ active.
⍝
⍝ Arguments for benchmarked functions: I_n holds n random integers
⍝ (? n⍴10), R_n is I_n+.1, i.e. the same values made non-integer.
⍝
I_100000000 ←? 100000000⍴10
R_100000000 ← I_100000000+.1
I_1000000 ←? 1000000⍴10
R_1000000 ← I_1000000+.1
I_10000 ←? 10000⍴10
R_10000 ← I_10000+.1
I_100 ←? 100⍴10
R_100 ← I_100+.1
TOTAL←1 3⍴' Time ' 'Operation' ' CNT'
TOTAL←TOTAL,[1] '-----------' '---------' '--------'
⍝ Baseline: time per pass for an empty expression (nothing subtracted).
T0←0.0 DELTA ''
⍝ Measure each expression with the baseline subtracted; 0⍴ discards the
⍝ returned times, the rows are collected in TOTAL by DELTA.
0⍴T0 DELTA¨EXPRS
TOTAL←TOTAL,[1] '-----------' '---------' '--------'
∇
⍝ Expressions to benchmark: integer⋆integer and integer⋆real power for
⍝ ravel lengths of 100000000, 1000000, 10000 and 100 elements.
EXPRS← 'I_100000000⋆I_100000000' 'I_100000000⋆R_100000000'
EXPRS←EXPRS,'I_1000000⋆I_1000000' 'I_1000000⋆R_1000000'
EXPRS←EXPRS,'I_10000⋆I_10000' 'I_10000⋆R_10000'
EXPRS←EXPRS,'I_100⋆I_100' 'I_100⋆R_100'
⍝ Run every expression for about 20000 ms (the TPP argument of GO),
⍝ then display the result table and leave the interpreter.
T←20000 GO EXPRS
T
)off
$ ./configure ASSERT_LEVEL_WANTED=0
[output elided]
$ make clean all
[output elided]
$ src/apl -f src/testcases/ParallelPerf.apl
______ _ __ __ __ ___ ____ __
/ ____// | / // / / / / | / __ \ / /
/ / __ / |/ // / / / / /| | / /_/ // /
/ /_/ // /| // /_/ / / ___ | / ____// /___
\____//_/ |_/ \____/ /_/ |_|/_/ /_____/
Welcome to GNU APL version 1.2 / 5829
Copyright (C) 2008-2014 Dr. Jürgen Sauermann
Banner by FIGlet: www.figlet.org
This program comes with ABSOLUTELY NO WARRANTY;
for details run: src/apl --gpl.
This program is free software, and you are welcome to redistribute it
according to the GNU Public License (GPL) version 3 or later.
∇ MAKE_FUT EXPR;F;TXT
∇ MS_CNT←RUN_PERF EXPR;MS;CNT;N1;N2
∇T←T0 DELTA EXPR;T2
∇TOTAL←TPP GO EXPRS;I_1000_1000;I_1000000;R_1000000; T0;T1
EXPRS← 'I_100000000⋆I_100000000' 'I_100000000⋆R_100000000'
EXPRS←EXPRS,'I_1000000⋆I_1000000' 'I_1000000⋆R_1000000'
EXPRS←EXPRS,'I_10000⋆I_10000' 'I_10000⋆R_10000'
EXPRS←EXPRS,'I_100⋆I_100' 'I_100⋆R_100'
T←20000 GO EXPRS
Running ' ' 76466910 times: 19227 ms
Running ' I_100000000⋆I_100000000 ' 5 times: 48528 ms
Running ' I_100000000⋆R_100000000 ' 5 times: 57459 ms
Running ' I_1000000⋆I_1000000 ' 215 times: 19465 ms
Running ' I_1000000⋆R_1000000 ' 205 times: 19436 ms
Running ' I_10000⋆I_10000 ' 23225 times: 20190 ms
Running ' I_10000⋆R_10000 ' 22795 times: 20297 ms
Running ' I_100⋆I_100 ' 2039900 times: 20361 ms
Running ' I_100⋆R_100 ' 1978395 times: 19998 ms
T
Time Operation CNT
----------- --------- --------
251.44 ns 76466910
9705.60 ms I_100000000⋆I_100000000 5
11491.80 ms I_100000000⋆R_100000000 5
90.53 ms I_1000000⋆I_1000000 215
94.81 ms I_1000000⋆R_1000000 205
869.07 μs I_10000⋆I_10000 23225
890.16 μs I_10000⋆R_10000 22795
9.73 μs I_100⋆I_100 2039900
9.86 μs I_100⋆R_100 1978395
----------- --------- --------
)off
Goodbye.
$ ./configure ASSERT_LEVEL_WANTED=0 --with-openmp
[output elided]
$ make clean all
[output elided]
$ src/apl -f src/testcases/ParallelPerf.apl
______ _ __ __ __ ___ ____ __
/ ____// | / // / / / / | / __ \ / /
/ / __ / |/ // / / / / /| | / /_/ // /
/ /_/ // /| // /_/ / / ___ | / ____// /___
\____//_/ |_/ \____/ /_/ |_|/_/ /_____/
Welcome to GNU APL version 1.2 / 5829
Compiled with OpenMP support (experimental)
Copyright (C) 2008-2014 Dr. Jürgen Sauermann
Banner by FIGlet: www.figlet.org
This program comes with ABSOLUTELY NO WARRANTY;
for details run: src/apl --gpl.
This program is free software, and you are welcome to redistribute it
according to the GNU Public License (GPL) version 3 or later.
∇ MAKE_FUT EXPR;F;TXT
∇ MS_CNT←RUN_PERF EXPR;MS;CNT;N1;N2
∇T←T0 DELTA EXPR;T2
∇TOTAL←TPP GO EXPRS;I_1000_1000;I_1000000;R_1000000; T0;T1
EXPRS← 'I_100000000⋆I_100000000' 'I_100000000⋆R_100000000'
EXPRS←EXPRS,'I_1000000⋆I_1000000' 'I_1000000⋆R_1000000'
EXPRS←EXPRS,'I_10000⋆I_10000' 'I_10000⋆R_10000'
EXPRS←EXPRS,'I_100⋆I_100' 'I_100⋆R_100'
T←20000 GO EXPRS
Running ' ' 15959460 times: 20800 ms
Running ' I_100000000⋆I_100000000 ' 5 times: 29432 ms
Running ' I_100000000⋆R_100000000 ' 5 times: 28931 ms
Running ' I_1000000⋆I_1000000 ' 525 times: 20758 ms
Running ' I_1000000⋆R_1000000 ' 520 times: 20913 ms
Running ' I_10000⋆I_10000 ' 46230 times: 21508 ms
Running ' I_10000⋆R_10000 ' 46670 times: 21867 ms
Running ' I_100⋆I_100 ' 2199445 times: 22004 ms
Running ' I_100⋆R_100 ' 2175140 times: 21338 ms
T
Time Operation CNT
----------- --------- --------
1.30 μs 15959460
5886.40 ms I_100000000⋆I_100000000 5
5786.20 ms I_100000000⋆R_100000000 5
39.54 ms I_1000000⋆I_1000000 525
40.22 ms I_1000000⋆R_1000000 520
463.94 μs I_10000⋆I_10000 46230
467.24 μs I_10000⋆R_10000 46670
8.70 μs I_100⋆I_100 2199445
8.51 μs I_100⋆R_100 2175140
----------- --------- --------
)off
Goodbye.