Hong Zhang schrieb: > Ando, > > I do not see any error message in the attached info below. > Even '-log_summary' gives the correct display. > I guess you sent us the working output (np=2). >
I have attached 3 files. The one you found with -log_summary printed is indeed the working scenario. The other two are the hanging runs. Output of top for np=4 when still "running": 8466 csae1801 25 0 1442m 704m 5708 R 100 5.9 1:20.87 externalsolver 8468 csae1801 25 0 1413m 697m 5052 R 100 5.8 1:13.45 externalsolver 8469 csae1801 25 0 1359m 614m 5148 R 100 5.1 1:12.75 externalsolver 8467 csae1801 25 0 1415m 702m 5096 R 96 5.9 1:13.01 externalsolver Output of top for np=4 when hanging: 8466 csae1801 18 0 1443m 769m 6120 S 0 6.4 2:09.47 externalsolver 8468 csae1801 15 0 1413m 759m 5420 S 0 6.3 2:00.87 externalsolver 8467 csae1801 15 0 1415m 748m 5396 S 0 6.2 2:01.21 externalsolver 8469 csae1801 18 0 1359m 688m 5460 S 0 5.7 2:01.39 externalsolver Other processes use about 12% of memory in total. > I would suggest you run your code with a debugger, > e.g., '-start_in_debugger'. > When it hangs, type Control-C, > and type 'where' to check where it hangs. > I guess it is hanging somewhere after the numerical factorization, because the extrapolated time would match.
Using the debug or the non-debug version doesn't change the behaviour. Output from 'where' (using gdb): #0 0x0000003a0ccc5cdf in poll () from /lib64/libc.so.6 #1 0x00000000011d1024 in MPIDU_Sock_wait (sock_set=0x4464890, millisecond_timeout=4, eventp=0xffffffffffffffff) at sock_wait.i:124 #2 0x00000000011a3203 in MPIDI_CH3I_Progress (blocking=71714960, state=0x4) at ch3_progress.c:1038 #3 0x00000000011843ce in PMPI_Recv (buf=0x4464890, count=4, datatype=-1, source=-1, tag=108517088, comm=168072704, status=0x4f503b0) at recv.c:156 #4 0x0000000000ea9926 in BI_Srecv (ctxt=0x4f522d0, src=-2, msgid=2, bp=0x1813ad8) at BI_Srecv.c:8 #5 0x0000000000ea9414 in BI_SringBR (ctxt=0x4f522d0, bp=0x1813ad8, send=0xea9800 <BI_Ssend>, src=1) at BI_SringBR.c:16 #6 0x0000000000ea22b1 in igebr2d_ (ConTxt=0x7fff0afeb110, scope=0x12a57f8 "Rowwise", top=0x17b9094 "S", m=0x12a57b8, n=0x12a57b8, A=0x7fff0afeb560, lda=0x12a57b8, rsrc=0x7fff0afeb118, csrc=0x7fff0afeb090) at igebr2d_.c:198 #7 0x0000000000e3b0f5 in pdpotf2 (uplo=Invalid C/C++ type code 13 in symbol table. ) at pdpotf2.f:340 #8 0x0000000000e2c818 in pdpotrf (uplo=Invalid C/C++ type code 13 in symbol table. 
) at pdpotrf.f:327 #9 0x0000000000c5daf6 in dmumps_146 (myid=0, root= {mblock = 48, nblock = 48, nprow = 2, npcol = 2, myrow = 0, mycol = 0, root_size = 2965, tot_root_size = 2965, cntxt_blacs = 0, rg2l_row = 0x676f0bf, rg2l_col = 0x676f107, ipiv = 0x676f14f, descriptor = {1, 0, 2965, 2965, 48, 48, 0, 0, 1488}, descb = {0, 0, 0, 0, 0, 0, 0, 0, 0}, yes = 4294967295, gridinit_done = 4294967295, lpiv = 1, schur_pointer = 0x676f1eb, schur_mloc = 0, schur_nloc = 0, schur_lld = 0, qr_tau = 0x676f23f, qr_rcond = 0, maxg = 0, gind = 0, grow = 0x676f297, gcos = 0x676f2df, gsin = 0x676f327, elg_max = 0, null_max = 0, elind = 0, euind = 0, nlupdate = 0, nuupdate = 0, perm_row = 0x676f387, perm_col = 0x676f3cf, elrow = 0x676f417, eurow = 0x676f45f, ptrel = 0x676f4a7, ptreu = 0x676f4ef, elelg = 0x676f537, euelg = 0x676f57f, dl = 0x676f5c7}, n=446912, iroot=266997, comm=-2080374780, iw=0x2aaaf5c49010, liw=8275423, ifree=1646107, a=0x2aaab9d6e010, la=125678965, ptrast=0xb09d2fc, ptlust_s=0xb05d200, ptrfac=0xb0727b0, step=0xb4aca20, info={0, 0}, ldlt=1, qr=0, wk=0x2aaacab98ba0, lwk=90267651, keep= {8, 2571, 96, 24, 16, 48, 150, 120, 400, 6875958, 2147483646, 200, 3015153, 3259551, 1655023, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 1646982, 3705, 21863, 8275423, 0, 0, 0, 0, 4, 8, 1, 800, 266997, 160000, -456788, 8, 0, 190998, 190998, 0, 1, 2, 5, 12663, 1, 48, 0, 0, 3, 0, 5, 500, 250, 0, 0, 0, 100, 60, 10, 120, 28139, 84754429, 0, 1, 0, 21863, 0, 0, 0, 1, 2, 30, 0, 2147483647, 1, 0, 5, 4, -8, 100, 1, 70, 70, 0, 1, 4, 0, 0, 0, 1, 0, 0, 0, 4, 12000000, 8791225, 150, 0, 16, 0, 1, 0, 1370, 0, 0, 0, 0, 11315240, 12209064, 0 <repeats 11 times>, 6167135, 3705, 0 <repeats 74 times>, 2214144, 0, 0, 0, 0, 0, 0, -1, 2, 2, 2214144, 201, 2, 0, 1, 0, 50, 1, 0, 0, 5, 2291986, 1670494, 1678547, 142320, 32, 0, 0, 0, 1, 3, 0, 1, 0, 0, 0, 12, 1, 10, 0 <repeats 260 times>}, keep8= {0, 407769668, 177587312, 0, 0, 0, 0, 0, 31341437, 30351541, 35301388, 41892965, 125678965, 12496233, 574564, 0, 37488833, 0 <repeats 
91 times>, 120657071, 0, 137362626, 0 <repeats 39 times>}) at dmumps_part7.F:286 #10 0x0000000000c17921 in dmumps_251 (n=446912, iw=0x2aaaf5c49010, liw=8275423, a=0x2aaab9d6e010, la=125678965, nstk_steps=0xb0dd3d0, nbprocfils=0xb0f296c, iflag=0, nd=0x4dbe8f0, fils=0xb661130, step=0xb4aca20, frere=0x4dd3ea0, dad=0x4de9450, cand=0x6a24830, istep_to_iniv2=0x4dfea00, tab_pos_in_pere=0x67bbff0, maxfrt=0, ntotpv=0, ptrist=0xb087d60, ptrast=0xb09d2fc, pimaster=0xb0b2898, pamaster=0xb0c7e34, ptrarw=0xb9c9f40, ptraiw=0xb815840, itloc=0xb107f08, ierror=0, ipool=0xb2bc608, lpool=21867, rinfo={28139655699.833332, 0 <repeats 19 times>}, posfac=35411315, iwpos=1646106, lrlu=90267651, iptrlu=125678965, lrlus=90267651, leaf=1865, nbroot=1, nbrtot=4, uu=0, icntl= {6, 0, 6, -1, 0, 0, 7, 77, 1, 0, 0, 1, 0, 200, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0 <repeats 11 times>, 1, 0}, ptlust_s=0xb05d200, ptrfac=0xb0727b0, nsteps=5877, info= {0, 0, 35301388, 1646982, 3705, 0, 8275423, 125678965, 0, 0, 0, 0, 0, 0, 1112, 1112, 421, 0, 8392947, 37488833, 0, 0, 0, 31341437, 0 <repeats 16 times>}, keep= {8, 2571, 96, 24, 16, 48, 150, 120, 400, 6875958, 2147483646, 200, 3015153, 3259551, 1655023, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 1646982, 3705, 21863, 8275423, 0, 0, 0, 0, 4, 8, 1, 800, 266997, 160000, -456788, 8, 0, 190998, 190998, 0, 1, 2, 5, 12663, 1, 48, 0, 0, 3, 0, 5, 500, 250, 0, 0, 0, 100, 60, 10, 120, 28139, 84754429, 0, 1, 0, 21863, 0, 0, 0, 1, 2, 30, 0, 2147483647, 1, 0, 5, 4, -8, 100, 1, 70, 70, 0, 1, 4, 0, 0, 0, 1, 0, 0, 0, 4, 12000000, 8791225, 150, 0, 16, 0, 1, 0, 1370, 0, 0, 0, 0, 11315240, 12209064, 0 <repeats 11 times>, 6167135, 3705, 0 <repeats 74 times>, 2214144, 0, 0, 0, 0, 0, 0, -1, 2, 2, 2214144, 201, 2, 0, 1, 0, 50, 1, 0, 0, 5, 2291986, 1670494, 1678547, 142320, 32, 0, 0, 0, 1, 3, 0, 1, 0, 0, 0, 12, 1, 10, 0 <repeats 260 times>}, keep8= {0, 407769668, 177587312, 0, 0, 0, 0, 0, 31341437, 30351541, 35301388, 41892965, 12567896---Type <return> to continue, or q <return> to quit--- 
-- /"\ Grassl Andreas \ / ASCII Ribbon Campaign Uni Innsbruck Institut f. Mathematik X against HTML email Technikerstr. 13 Zi 709 / \ +43 (0)512 507 6091