Alex,

Thank you very much for sharing these interesting details about meep-mpi. For some reason that I cannot explain (yet), the meep-mpi load now splits up evenly on our platform. However, the scaling is far from ideal, unlike what you report ("In the complete absence of HDF5 I/O, we found meep-mpi to show near optimal scaling out to 32 processors or more").

My 3D benchmark model boils down to the following simple setup:
- PML at the top & bottom (z)
- Bloch conditions (complex)
- eps-averaging on
- only free space between the PML (empty geometry)
- total cell size (x*y*z): a*a*(6*a) at a resolution of 160, i.e. a 160*160*960 grid
- gaussian source plane
- no HDF5 output, not even epsilon

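For reference, this is 160*160*960 ≈ 24.6 million grid points; with Meep's default Courant factor of 0.5 (assuming it is left unchanged here), runtime0=2 corresponds to 2*160/0.5 = 640 time steps.
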
I attached my ctl file below. If you have a minute, could you tell me how it scales on your cluster (say, for 2, 4 and 8 processors)? Thanks a lot in advance.
Here are my results for 4 and 8 processors:

_____________________________________________________________
4 procs: runtime = 465 s (16 s to initialize/average the structure)
8 procs: runtime = 476 s (7 s to initialize/average the structure)
_____________________________________________________________________________
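Going from 4 to 8 processes thus gives a speedup of only 465/476 ≈ 0.98, i.e. a parallel efficiency of about 49% relative to the 4-process run; the extra processes buy essentially nothing.
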
mpirun -genv I_MPI_SPIN_COUNT 1 -np 4 meep-mpi res=160 runtime0=2 testmpi.ctl
4 procs: runtime = 465 s (16 s to initialize/average the structure)
______________________stats for 4 processors_____________________
Tasks: 228 total,   5 running, 222 sleeping,   0 stopped,   1 zombie
Cpu2  : 95.7%us,  4.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu4  : 95.0%us,  5.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu9  : 94.7%us,  5.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu15 : 95.3%us,  4.7%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:  16411088k total,  6047088k used, 10364000k free,      256k buffers
Swap:        0k total,        0k used,        0k free,   304812k cached

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16811 gdemesy   25   0 1243m 1.1g 6804 R 100.1  7.0   1:19.76 meep-mpi
16812 gdemesy   25   0 1243m 1.1g 6808 R 100.1  7.0   1:19.76 meep-mpi
16813 gdemesy   25   0 1220m 1.1g 7080 R 100.1  6.8   1:19.75 meep-mpi
16814 gdemesy   25   0 1217m 1.1g 7284 R 100.1  6.8   1:19.76 meep-mpi
_________________________________________________________________
mpirun -genv I_MPI_SPIN_COUNT 1 -np 8 meep-mpi res=160 runtime0=2 testmpi.ctl
8 procs: runtime = 476 s (7 s to initialize/average the structure)
______________________stats for 8 processors_____________________
Tasks: 236 total,   9 running, 226 sleeping,   0 stopped,   1 zombie
Cpu0  : 85.4%us, 14.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu1  : 90.4%us,  9.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu2  : 94.4%us,  5.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu3  : 95.0%us,  5.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu4  : 94.7%us,  5.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu5  : 94.7%us,  5.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu14 : 91.1%us,  8.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu15 : 86.1%us, 13.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:  16411088k total,  6187864k used, 10223224k free,      256k buffers
Swap:        0k total,        0k used,        0k free,   308068k cached
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16271 gdemesy   25   0  768m 573m 6864 R 100.1  3.6   0:46.85 meep-mpi
16273 gdemesy   25   0  772m 577m 6880 R 100.1  3.6   0:46.85 meep-mpi
16277 gdemesy   25   0  751m 556m 6844 R 100.1  3.5   0:46.84 meep-mpi
16278 gdemesy   25   0  751m 556m 6804 R 100.1  3.5   0:46.85 meep-mpi
16272 gdemesy   25   0  768m 574m 7064 R 99.8  3.6   0:46.84 meep-mpi
16274 gdemesy   25   0  768m 573m 6892 R 99.8  3.6   0:46.84 meep-mpi
16275 gdemesy   25   0  772m 577m 6960 R 99.8  3.6   0:46.84 meep-mpi
16276 gdemesy   25   0  771m 578m 6832 R 99.8  3.6   0:46.84 meep-mpi
_________________________________________________________________
ctl file: testmpi.ctl
_________________________________________________________________

; --- run-control and resolution parameters (all overridable from the command line) ---
(reset-meep)
(define-param decay_value 1.e-3)
(define-param res 20)
(define-param nfreq 1000)
(define-param runtime 700)
(define-param runtime0 400)
(set! eps-averaging? true)
(set-param! resolution res)

; --- source frequency and materials (the materials are defined but unused:
;     the benchmark cell is empty) ---
(define-param a    1.)
(define-param fcen 0.285)
(define-param df 0.08)
(define-param eps-si 11.9)
(define-param sigma-si-norm 0.01)
(define-param sigma-si-d (/ sigma-si-norm eps-si))
(define mat-silicon (make dielectric (epsilon eps-si) (D-conductivity sigma-si-d)))
(define mat-air     (make dielectric (epsilon 1.0)))

; --- vertical extents of the (empty) crystal region, superstrate and substrate;
;     the pore parameters are unused in this benchmark ---
(define-param zpcmin (* -1.5 a))
(define-param zpcmax (*  1.5 a))
(define-param hpc    (- zpcmax zpcmin))
(define-param hsuper (*  0.5 a))
(define-param hsubs  (*  0.5 a))
(define-param r-pores-moy  (*  0.25 a))
(define-param mod-pores    (*  0.05 a))

; --- cell cross-section, total height and PML thickness ---
(define-param sx  (* 1. a))
(define-param sy  (* 1. a))
(define-param sz  (+ hsuper hsubs hpc))
(define-param dpml a)

; --- in-plane Bloch wave vector, matching source amplitudes and incidence angles ---
(define-param kx (/  0. a))
(define-param ky (/ -0.2 a))
(define-param kpara (sqrt (+ (* kx kx) (* ky ky))))
(define-param Exampl (* -1 ky (/ 1 kpara)))
(define-param Eyampl (*    kx (/ 1 kpara)))
(define-param theta (asin (/ kx fcen)))
(define-param phi   (atan (/ ky kx)))
(define-param theta_deg (* 180. (/ 1 pi) (asin (/ kx fcen))))

; --- computational cell: PML at top and bottom (z) only, Bloch-periodic in x and y ---
(define szpml (+ sz (* 2. dpml)))
(set! geometry-lattice (make lattice (size sx sy szpml)))
(set! pml-layers (list (make pml (thickness dpml) (direction Z))))
(set! ensure-periodicity true)
(set! k-point (vector3 kx ky 0))

; plane-wave phase factor exp(2*pi*i*(kx*x + ky*y)) applied across the source plane
(define (my-amp-func p) (* (exp (* 0+2i pi kx (vector3-x p))) (exp (* 0+2i pi ky (vector3-y p)))))

; gaussian plane source in the superstrate, just below the upper PML
(set! sources (list
               (make source
                 (src (make gaussian-src (frequency fcen) (fwidth df)))
                 (component Ex)
                 (center 0. 0. (+ zpcmax (* 0.9 hsuper))) (size sx sy 0)
                 (amp-func my-amp-func)
                 (amplitude Exampl))))

; time-step until t = runtime0 (no HDF5 output at all)
(run-until runtime0)

_____________________________________________________________________________

Best regards,


Guillaume


[email protected] a écrit :

Date: Sat, 17 Jul 2010 04:20:02 -0700
From: Alex McLeod <[email protected]>
Subject: Re: [Meep-discuss] meep-discuss Digest, Vol 53, Issue 12
To: [email protected]
Message-ID: <[email protected]>
Content-Type: text/plain; charset="iso-8859-1"; Format="flowed";
        DelSp="yes"

Nizamov, Guillaume,

I can speak from my own off-hand experience using meep-mpi on our
cluster, whose technical details I list below:

vulcan.lbl.gov (1936 PE)
Dell PowerEdge R610 Cluster
242 dual-socket, quad-core Intel 2.4 GHz Nehalem processor nodes
5808GB aggregate memory
48TB Bluearc NFS storage
60TB DDN S2A6620 Lustre storage
Qlogic QDR Infiniband interconnect
18.5 TF (theoretical peak)

We have compiled meep-mpi with OpenMPI-intel-1.4.1 and against HDF5
1.8.4p1-intel-serial.  When running massive 3D volume calculations
with PMLs on all boundaries and with frequent heavy HDF5 I/O, I
achieve the fastest calculation speeds with around 16 processors while
using 4 processors per node.  In all cases I observe pretty even
memory distribution, so long as the simulation volume in voxel units
divides evenly by 16.

The HDF5 I/O actually slows the overall calculation by a factor of 2
on account of overhead associated with HDF5 calls.  For our use case,
we found this overhead to be even greater with parallel HDF5, which is
evidently optimized for writing of datasets far larger than we have
the capacity to compute with FDTD.  So, we have stuck with serial
HDF5.  In the complete absence of HDF5 I/O, we found meep-mpi to show
near optimal scaling out to 32 processors or more.

Guillaume, what benchmark are you running exactly?  I.e., are you
using HDF5 output, and if so, how frequently and over what volumes, or
any additional field computations, flux volumes, etc.?

Best,
Alex
____________________________________________________________________

Alexander S. McLeod
B.A. Physics and Astrophysics - University of California at Berkeley
Simulation Engineer - Theory Group, Molecular Foundry (LBNL)
Site Lead - Network for Computational Nanotechnology at Berkeley / MIT
[email protected]    707-853-0716
____________________________________________________________________

On Jul 16, 2010, at 5:54 AM, [email protected] wrote:


Hi Nizamov,

Thanks for your comments. I should mention that the previous job corresponds to a normalization run, where there is only free space and PMLs. I have only one source plane term and a set of Bloch conditions. I really don't know how meep splits the domain into chunks, but I was figuring that this was done along the propagation direction.
You are right, I may have to look at the source :\

Below are the results for 8 procs.

Tasks: 237 total,   9 running, 227 sleeping,   0 stopped,   1 zombie
Cpu1  : 55.3%us, 44.7%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu2  : 54.4%us, 45.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu5  : 52.8%us, 47.2%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu6  : 52.6%us, 47.4%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu8  : 40.4%us, 59.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu15 : 54.0%us, 46.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu3  : 99.1%us,  0.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu4  : 99.1%us,  0.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:  16411088k total, 11423856k used,  4987232k free,      256k buffers
Swap:        0k total,        0k used,        0k free,   275088k cached

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 5907 gdemesy   25   0  946m 751m 6840 R 104.9  4.7   2:06.09 meep-mpi
 5909 gdemesy   25   0  949m 755m 7000 R 104.9  4.7   2:06.11 meep-mpi
 5908 gdemesy   25   0  946m 751m 6856 R 104.6  4.7   2:06.12 meep-mpi
 5906 gdemesy   25   0  946m 751m 7068 R 102.1  4.7   2:06.02 meep-mpi
 5902 gdemesy   25   0  949m 755m 7036 R 101.8  4.7   2:06.02 meep-mpi
 5903 gdemesy   25   0  946m 751m 6892 R 101.8  4.7   2:06.02 meep-mpi
 5905 gdemesy   25   0 2798m 2.5g 6992 R 101.8 16.3   2:06.02 meep-mpi
 5904 gdemesy   25   0 2794m 2.5g 7096 R 101.5 16.2   2:06.02 meep-mpi

Again, my 10 GB load is not evenly split... And the run is even
longer than with 4 processors.
If we modify the structure, say by removing the Bloch conditions, the
load is again unevenly dispatched:

Cpu3  : 99.4%us,  0.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu4  : 99.4%us,  0.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu0  : 47.3%us, 52.7%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu1  : 47.9%us, 52.1%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu2  : 47.6%us, 52.4%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu5  : 47.9%us, 52.1%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu6  : 48.1%us, 51.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu7  : 47.1%us, 52.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:  16411088k total,  8093036k used,  8318052k free,      256k buffers
 PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
9116 gdemesy   25   0 2330m 2.1g 6224 R 100.9 13.3   0:35.67 meep-mpi
9117 gdemesy   25   0 2332m 2.1g 6152 R 100.9 13.3   0:35.66 meep-mpi
9119 gdemesy   25   0  561m 366m 6088 R 101.3  2.3   0:35.67 meep-mpi
9118 gdemesy   25   0  561m 366m 6204 R 100.9  2.3   0:35.66 meep-mpi
9120 gdemesy   25   0  558m 363m 5788 R 100.9  2.3   0:35.66 meep-mpi
9114 gdemesy   25   0  563m 368m 6088 R 100.6  2.3   0:35.66 meep-mpi
9115 gdemesy   25   0  561m 366m 6164 R 100.6  2.3   0:35.66 meep-mpi
9121 gdemesy   25   0  560m 365m 5776 R 100.6  2.3   0:35.65 meep-mpi

Now let's add the slab to this dummy job... It doesn't change much:
Cpu11 : 99.7%us,  0.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu4  : 99.7%us,  0.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu0  : 46.8%us, 53.2%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu2  : 47.3%us, 52.7%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu7  : 43.9%us, 56.1%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu9  : 47.1%us, 52.9%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu13 : 47.4%us, 52.6%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu14 : 48.2%us, 51.8%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:  16411088k total,  8586420k used,  7824668k free,      256k buffers
 PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
9732 gdemesy   25   0  561m 367m 6884 R 100.3  2.3   1:19.13 meep-mpi
9733 gdemesy   25   0 2571m 2.3g 7020 R 100.3 14.8   1:19.12 meep-mpi
9731 gdemesy   25   0  563m 368m 6636 R 100.0  2.3   1:19.12 meep-mpi
9734 gdemesy   25   0 2573m 2.3g 6832 R 100.0 14.8   1:19.12 meep-mpi
9735 gdemesy   25   0  561m 367m 6900 R 100.0  2.3   1:19.12 meep-mpi
9736 gdemesy   25   0  561m 367m 6624 R 100.0  2.3   1:19.11 meep-mpi
9737 gdemesy   25   0  560m 365m 6152 R 100.0  2.3   1:19.12 meep-mpi
9738 gdemesy   25   0  558m 363m 6116 R 100.0  2.3   1:19.13 meep-mpi

Thanks for your help anyway... I will keep you posted if I manage to
get better perf.

Best,

Guillaume



Nizamov Shawkat <[email protected]> a ?crit :

In your case, have you witnessed this kind of unbalanced behavior (unbalanced memory, I mean)?

Sorry, I do not remember exact details.

Let's see once again:

18175    25   0  353m 221m 6080 R  99.8  1.4   1:10.41  1  meep-mpi
18174    25   0  354m 222m 6388 R 100.2  1.4   1:10.41  6  meep-mpi
18172    25   0 1140m 1.0g 7016 R  99.8  6.3   1:10.41  2  meep-mpi
18173    25   0 1140m 1.0g 6804 R  99.5  6.3   1:10.40  4  meep-mpi

Tasks: 228 total,   5 running, 222 sleeping,   0 stopped,   1 zombie
Cpu1  : 23.9%us, 76.1%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu6  : 23.3%us, 76.7%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu2  : 99.7%us,  0.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu4  : 99.7%us,  0.3%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st

Well, it may be possible that the simulation space is divided unevenly.
In this case, the results seem quite natural: the bigger simulation volumes
(cpu2 and cpu4) run at full speed, while the 3-4 times smaller volumes
(cpu1 and cpu6) complete their simulation steps circa 3 times faster
and waste the remaining time waiting for the two other cores.

If this is the correct interpretation, then there is nothing wrong with
your setup and:

1) it would mean that the splitting of the overall simulation volume into
separate per-core volumes was not performed optimally by meep. Any meep
developer to comment? I remember that the splitting algorithm takes the
structure into account and optimizes the chunk volumes correspondingly.
E.g., cores 1 and 6 may actually be simulating the slab volume, while
cores 2 and 4 are calculating the free space/PML. Try without the slab
to see if the distribution is even in that case.

2) scaling might be much better when you further increase the number
of cores, because the simulation volume may then be divided more evenly.
Can you try it?

Actually, it would be interesting to compare how the simulation volume is
divided for different numbers of processor cores, with and without the slab;
this may give a clue as to how the splitting works. Another option is to
look at the sources :)

With best regards
Shawkat Nizamov














