Hi,
I'm running Torque 2.5.11 / Maui 3.3.1 on a small ~200 core cluster with
typical job queues of 10k jobs or so and maui segfaults every week or
two without logging anything relevant (with log level ERROR).
I've got a core dump which I inspected with gdb and it seems to segfault
on the following line at MPBSI.c:3213:
3213 if (J->Req[0]->DRes.Procs == -1)
Short and full backtraces are attached. There is nothing relevant that I
could find in Torque logs around this time...
Maui is compiled with default options, but with the attached patch from
CERN applied, because before I was experiencing lockups every day or
so... and with the patch and increased reservation depth they are mostly
gone.
The configuration file is also attached to this message.
I have checked out the svn, but it seems that no patches have been
committed to it since the 3.3.1 release. Also, I couldn't find anything
relevant on the lists.
If I can be of any help to further diagnose this issue, please let me
know... I'd appreciate any help in sorting this out.
Thanks,
--
Sincerely yours,
Yury V. Zaytsev
#0 0x00000000004a663c in MPBSNodeUpdate (N=0x9686a50, PNode=<value optimized
out>, NState=<value optimized out>, R=0x6a4a100) at MPBSI.c:3213
tail = 0x7fff466d6641 "/.de"
JobID =
".de\000or13\00005].hat\a\000\000\000\000\000\000\000\005\000\000\000\000\000\000\000\001\000\000\000\000\000\000\0000\016�\t\000\000\000\000�fuF�\177\000\000�\036�\t\000\000\000"
AP = (struct attrl *) 0x9d59df0
J = (mjob_t *) 0x9e72a50
ptr = 0x7fff466d6640 "0/.de"
tmpBuffer = "0/.de\000 1/1134413[370].hathor.bcfgrid.uni-freiburg.de,
2/1134413[398].hathor.bcfgrid.uni-freiburg.de,
3/1134413[420].hathor.bcfgrid.uni-freiburg.de", '\0' <repeats 31390 times>,
"BFWindow", '\0' <repeats 72 times>,
"\001\v\002\000\000\000\000\000[ALL]\000T", '\0' <repeats 73 times>, "\004"...
TokPtr = 0x7fff466d6646 "
1/1134413[370].hathor.bcfgrid.uni-freiburg.de,
2/1134413[398].hathor.bcfgrid.uni-freiburg.de,
3/1134413[420].hathor.bcfgrid.uni-freiburg.de"
tmpTime = 1338142964
tmpProcs = <value optimized out>
#1 0x00000000004a7262 in MPBSClusterQuery (R=0x6a4a100, RCount=0x7fff46756930,
EMsg=<value optimized out>, SC=<value optimized out>) at MPBSI.c:1316
nodes = (struct batch_status *) 0x9d59ba0
ErrMsg = <value optimized out>
Name =
"hathor13\000\026�\2359\177\000\000\000\000\000\000\031\000\000\000����\005\000\000\000\f\000\000\000\000\000\000\000�8\235\a\000\000\000\000�luF�\177\000\0000�H\000\000\000\000"
NewState = mnsBusy
NewNode = 0
OldState = mnsBusy
cur_node = (struct batch_status *) 0x9d3cf90
N = (mnode_t *) 0x9686a50
#2 0x0000000000461667 in __MUTFunc (V=0x7fff467567a0) at MUtil.c:4718
No locals.
#3 0x0000000000463d7f in MUThread (F=<value optimized out>, TimeOut=166144592,
RC=0x1, ACount=0, Lock=<value optimized out>) at MUtil.c:4691
rc = 0
MyLock = 1
D = {
Func = 0x4a7050 <MPBSClusterQuery>,
Arg = {0x6a4a100, 0x7fff46756930, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xffffffffffffffa8, 0x79d389c, 0xb, 0xa},
TimeOut = 127744116,
RC = 0x7fff46756934,
Lock = 0x7fff46756838
}
Args = {{
gp_offset = 48,
fp_offset = 0,
overflow_arg_area = 0x7fff46756918,
reg_save_area = 0x7fff46756840
}}
#4 0x000000000049b6c7 in MRMClusterQuery (RCount=0x7fff46756978, SC=0x0) at
MRM.c:493
rmindex = 0
rc = 0
tmpRCount = 23
TotalRCount = 0
#5 0x000000000049c794 in MRMGetInfo () at MRM.c:352
J = <value optimized out>
JobCount = <value optimized out>
NodeCount = <value optimized out>
APIFailureCount = 0
#6 0x000000000042d92d in MSchedProcessJobs (OldDay=0x7fff467d8e30 "Sun",
GlobalSQ=0x7fff467b6e20, GlobalHQ=0x7fff46796e20) at MSched.c:6923
PIndex = <value optimized out>
P = <value optimized out>
tmpQ = {-1, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547,
548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562,
563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578,
579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593,
594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609,
610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640,
641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655,
656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671,
672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686,
687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702,
703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717,
718, 719...}
CurrentQ = {-1 <repeats 8117 times>, 0 <repeats 23315 times>, 64, 0,
1182226751, 32767, 1182226768, 32767, 1182226752, 32767, 3, 0, 0, 2, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 0 <repeats 12 times>, 3, 0, -1641506304, 32569, 0, 0, 0,
0, 0, 0, 0, 0, -1644591016, 32569, 3, 0, 1182359200, 32767, 4610132, 0,
-1644574254, 32569, 156252368, 0, 156300576, 0, 0, 0, 5010726, 0, 0, 0,
1182359200, 32767, 156252368, 0, 1182228112, 32767, 0 <repeats 40 times>, 6, 0,
0,
2, 0 <repeats 22 times>, 6, 0, -1641506304, 32569, 1182359704, 32767,
1182358512, 32767, -1633536352, 32569, 1182359704, 32767, -1644591016, 32569,
6, 0,
1182358512, 32767, 1182359704, 32767, -1644574254, 32569, 1182358512, 32767,
156252368, 0, 1182358512, 32767, 5009296, 0, 1182362016, 32767, 0, 0, 0, 0,
5014631, 0, 0, 0, 0, 0, 1182362010, 32767, 0 <repeats 94 times>, 176, 0,
1182227855, 32767, 1182227872, 32767, 1182227856, 32767, 128, 0, 0, 9, 0, 0, 0,
0, 0, 0, 0, 0, 0, 9, 0 <repeats 12 times>...}
Value = 140734375751200
#7 0x00000000004062c9 in main (ArgC=1, ArgV=<value optimized out>) at
Server.c:192
OldDay = "Sun\000\000\000\000\000\005\017�\a", '\0' <repeats 12 times>,
"\200\210L", '\0' <repeats 13 times>,
"C*@\000\000\000\000\000\210\217}F�\177\000\000�\210L\000\000\000\000"
GlobalSQ = {-1, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531,
532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546,
547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561,
562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577,
578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592,
593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608,
609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639,
640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654,
655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670,
671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685,
686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716,
717, 718, 719...}
GlobalHQ = {-1, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531,
532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546,
547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561,
562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577,
578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592,
593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608,
609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639,
640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654,
655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670,
671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685,
686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716,
717, 718, 719...}
tmpArgV = {0x0 <repeats 651 times>, 0x7fff467d8300 "", 0x7fff467d8610
"�\026@", 0x7f399e82026f "H\203�\001H\215\025�(\001", 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x168000 <Address 0x168000 out of bounds>, 0x167c50
<Address 0x167c50 out of bounds>, 0x167c50 <Address 0x167c50 out of bounds>,
0x0,
0x7f399e827525 "\205�u�H\203�\b�\001", 0x368000 <Address 0x368000 out of
bounds>, 0x7f399ea38528 "", 0x7f399df3160d "ld-linux-x86-64.so.2",
0x7f399e820a24 "\205�t�H\201��\003", 0x168000 <Address 0x168000 out of
bounds>, 0x3 <Address 0x3 out of bounds>, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x1 <Address 0x1 out of bounds>, 0x7fff467d83f0 "", 0x7fff467d8640
"�\211}F�\177", 0x7f399e82026f "H\203�\001H\215\025�(\001", 0x0,
0x5 <Address 0x5 out of bounds>, 0x283000 <Address 0x283000 out of bounds>,
0x285000 <Address 0x285000 out of bounds>,
0x284090 <Address 0x284090 out of bounds>, 0x2840d8 <Address 0x2840d8 out of
bounds>, 0x83000 <Address 0x83000 out of bounds>,
0x3 <Address 0x3 out of bounds>, 0x0, 0x2f000 <Address 0x2f000 out of
bounds>, 0x2ea74 <Address 0x2ea74 out of bounds>,
0x2ea74 <Address 0x2ea74 out of bounds>, 0x0, 0x5 <Address 0x5 out of
bounds>, 0x22f000 <Address 0x22f000 out of bounds>,
0x7f399e827525 "\205�u�H\203�\b�\001", 0x230c60 <Address 0x230c60 out of
bounds>, 0x7f399ea24000 "", 0x7f399e294e3c "libc.so.6",
0x7f399e820a24 "\205�t�H\201��\003", 0x0, 0x100000000 <Address 0x100000000
out of bounds>, 0x4016f2 "libc.so.6", 0x7f399ea35eb8 "/lib/libc.so.6",
0xa <Address 0xa out of bounds>, 0x7f399ea36000 "", 0x7fff467d8610 "�\026@",
0x0, 0x1 <Address 0x1 out of bounds>, 0x1 <Address 0x1 out of bounds>,
0x7f399ea36001 "", 0x7f399e8200dd "�E\020\b\017\204\232\002", 0x7f399ea354c8
"", 0x0, 0x7fff467d8610 "�\026@",
0x7f399e81e110
"A\211�A\213D\236$\205�\017\2040���\203�\001\017\225�\017��A\t�A\203��\017\2051���H9\035�\234!",
0x726f742f74706f2f <Address 0x726f742f74706f2f out of bounds>,
0x2e352e322d657571 <Address 0x2e352e322d657571 out of bounds>,
0x696c2f62696c2f39 <Address 0x696c2f62696c2f39 out of bounds>,
0x362e6f732e6362 <Address 0x362e6f732e6362 out of bounds>,
0x726f742f74706f2f <Address 0x726f742f74706f2f out of bounds>, 0x7f399e28d298
"", 0x7fff467d86c8 "\006", 0x7f399e827525 "\205�u�H\203�\b�\001",
0x3322e6f73 <Address 0x3322e6f73 out of bounds>, 0x7f399ea24000 "",
0x7f399e59684e "libc.so.6", 0x7f399e820a24 "\205�t�H\201��\003",
0x7fff467d8480 "�T�\2369\177", 0x3467d8680 <Address 0x3467d8680 out of
bounds>, 0x7f399ea38c80 "\001", 0x7f399ea36318 "�T�\2369\177", 0x0,
0xa <Address 0xa out of bounds>, 0x401601 "bs_statjob", 0x0, 0x1 <Address 0x1
out of bounds>, 0x7f399e289000 "", 0x801 <Address 0x801 out of bounds>,
0x3fe9 <Address 0x3fe9 out of bounds>, 0x1 <Address 0x1 out of bounds>,
0x81ed <Address 0x81ed out of bounds>, 0x0, 0x0,
0x16ed30 <Address 0x16ed30 out of bounds>, 0x1000 <Address 0x1000 out of
bounds>, 0xb80 <Address 0xb80 out of bounds>,
0x4cdad872 <Address 0x4cdad872 out of bounds>, 0x0, 0x4cc0eff4 <Address
0x4cc0eff4 out of bounds>, 0x0, 0x4cdad871 <Address 0x4cdad871 out of bounds>,
0x0, 0x0, 0x0, 0x0, 0x300000224 <Address 0x300000224 out of bounds>, 0x3
<Address 0x3 out of bounds>,
0x7f399ea30fd7 <Address 0x7f399ea30fd7 out of bounds>, 0x7f399ea36000 "",
0x0, 0x7f399ea36001 "", 0x4016f2 "libc.so.6",
0x7f399e820b3b "H\211������\017\037D", 0x7f3900000000 <Address 0x7f3900000000
out of bounds>, 0x7fff467d89c8 "p\217}F�\177", 0x0, 0x7f399ea36001 "",
0x7fff467d89df "", 0x7fff467d89d0 "�^�\2369\177", 0x7fff467d8680 "@\003",
0x0, 0x1 <Address 0x1 out of bounds>, 0xa <Address 0xa out of bounds>,
0x7f399ea36000 "", 0x1007fff467d8a00 <Address 0x1007fff467d8a00 out of
bounds>, 0x340 <Address 0x340 out of bounds>,
0x10102464c457f <Address 0x10102464c457f out of bounds>, 0x0, 0x1003e0003
<Address 0x1003e0003 out of bounds>, 0x1e6d0 <Address 0x1e6d0 out of bounds>,
0x40 <Address 0x40 out of bounds>, 0x16dbb0 <Address 0x16dbb0 out of bounds>,
0x38004000000000 <Address 0x38004000000000 out of bounds>,
0x4500460040000a <Address 0x4500460040000a out of bounds>, 0x500000006
<Address 0x500000006 out of bounds>, 0x40 <Address 0x40 out of bounds>,
0x40 <Address 0x40 out of bounds>, 0x40 <Address 0x40 out of bounds>,
0x7fff467d8cd0 "\210\216}F�\177", 0x7fff467d8d40 ".N=�",
0x10 <Address 0x10 out of bounds>, 0x7f399e287b40 "\001", 0x0, 0x7f399ea24000
"", 0x7f399e8246ed "H\211C H\203�\020[�f\017\037\204", 0x0,
0x1c <Address 0x1c out of bounds>, 0x0, 0x7f399e8267d6 "H\213\204$\030\001",
0x7fff467d8cd0 "\210\216}F�\177", 0x7f399e8246b0 "SH\211��\001",
0x7fff467d8d0f "", 0x7fff467d8d00 "\020��\2359\177", 0x7fff467d8cf8 "�\032@",
0x7f399ea38b38 "", 0x600000001 <Address 0x600000001 out of bounds>, 0x0,
0x368728 <Address 0x368728 out of bounds>, 0x7f399df3160d
"ld-linux-x86-64.so.2", 0xfffe8cfb1a800000 <Address 0xfffe8cfb1a800000 out of
bounds>,
0x10 <Address 0x10 out of bounds>, 0x7f399e287b40 "\001", 0x0, 0x7f399ea24000
"", 0xfffe8cfb0e800000 <Address 0xfffe8cfb0e800000 out of bounds>,
0xfe733d04cf760000 <Address 0xfe733d04cf760000 out of bounds>, 0x1e0 <Address
0x1e0 out of bounds>, 0x1e0 <Address 0x1e0 out of bounds>,
0x8 <Address 0x8 out of bounds>, 0x400000004 <Address 0x400000004 out of
bounds>, 0x7fff467d8cd0 "\210\216}F�\177", 0x7fff467d8d40 ".N=�",
0x10 <Address 0x10 out of bounds>, 0x7f399e4bdc88 "\001", 0x0, 0x7f399ea359f8
"", 0x7f399e8246ed "H\211C H\203�\020[�f\017\037\204", 0x0,
0x2 <Address 0x2 out of bounds>, 0x7f399ea24520 "", 0x7f399e8257fc
"D\213\215����L\213\205����A\203�\001\017\206\035����V���H\213\225\020���\203��\002",
0x7fff00000001 <Address 0x7fff00000001 out of bounds>, 0x7f399ea38528 "",
0x0, 0x0, 0x7f399df3160d "ld-linux-x86-64.so.2"...}
aindex = <value optimized out>
#0 0x00000000004a663c in MPBSNodeUpdate (N=0x9686a50, PNode=<value optimized
out>, NState=<value optimized out>, R=0x6a4a100) at MPBSI.c:3213
#1 0x00000000004a7262 in MPBSClusterQuery (R=0x6a4a100, RCount=0x7fff46756930,
EMsg=<value optimized out>, SC=<value optimized out>) at MPBSI.c:1316
#2 0x0000000000461667 in __MUTFunc (V=0x7fff467567a0) at MUtil.c:4718
#3 0x0000000000463d7f in MUThread (F=<value optimized out>, TimeOut=166144592,
RC=0x1, ACount=0, Lock=<value optimized out>) at MUtil.c:4691
#4 0x000000000049b6c7 in MRMClusterQuery (RCount=0x7fff46756978, SC=0x0) at
MRM.c:493
#5 0x000000000049c794 in MRMGetInfo () at MRM.c:352
#6 0x000000000042d92d in MSchedProcessJobs (OldDay=0x7fff467d8e30 "Sun",
GlobalSQ=0x7fff467b6e20, GlobalHQ=0x7fff46796e20) at MSched.c:6923
#7 0x00000000004062c9 in main (ArgC=1, ArgV=<value optimized out>) at
Server.c:192
# Increase the length of diagnose -f.
===================================================================
RCS file: maui-3.2.6p20/include/RCS/msched-common.h,v
retrieving revision 1.1
diff -u -r1.1 maui-3.2.6p20/include/msched-common.h
--- maui-3.2.6p20/include/msched-common.h 2008/09/04 11:45:37 1.1
+++ maui-3.2.6p20/include/msched-common.h 2008/09/04 11:46:31
@@ -150,14 +150,14 @@
#endif /* DEFAULT */
#define MAX_PATH_LEN 256
-#define MAX_SBUFFER 262144
+#define MAX_SBUFFER 2097152
#define MAX_MPATH_LEN 256
#define MMAX_PATH_LEN 256
#define MAX_MLINE 1024
#define MMAX_LINE 1024
-#define MAX_MBUFFER 65536
+#define MAX_MBUFFER 524288
#define MMAX_BUFFER 65536
#define MAX_MSBUFFER 262144
#define MAX_MENVVAR 64
diff -uNr maui-3.2.6p9.orig/include/msched-common.h maui-3.2.6p9/include/msched-common.h
--- maui-3.2.6p9.orig/include/msched-common.h 2004-12-03 13:55:22.000000000 +0000
+++ maui-3.2.6p9/include/msched-common.h 2004-12-03 13:56:24.000000000 +0000
@@ -368,7 +368,7 @@
unsigned long InitialWorkload;
} mrclass_t;
-#define MAX_MCLASS 16
+#define MAX_MCLASS 64
#define MMAX_CLASS 16
#define MAX_MGRES 4
diff -uNr maui-3.2.6p14.ORIG/include/msched.h maui-3.2.6p14/include/msched.h
--- maui-3.2.6p14.ORIG/include/msched.h 2006-02-15 10:17:16.000000000 +0000
+++ maui-3.2.6p14/include/msched.h 2006-02-15 10:20:13.000000000 +0000
@@ -440,11 +440,11 @@
#define MAX_VAL 2140000000
-#define MMAX_JOB 4096
+#define MMAX_JOB 32768
#define MAX_MJOB MMAX_JOB
-#define MAX_MJOB_TRACE 4096
+#define MAX_MJOB_TRACE 32768
#define MAX_TASK_REQUESTS 32
#define MAX_REQ_TYPE 16
diff -uNr maui-3.2.6p14.ORIG/include/moab.h maui-3.2.6p14/include/moab.h
--- maui-3.2.6p14.ORIG/include/moab.h 2006-05-04 17:37:38.000000000 +0100
+++ maui-3.2.6p14/include/moab.h 2006-05-04 17:41:38.000000000 +0100
@@ -310,7 +310,7 @@
#endif /* MAX_MATTR */
#ifndef MAX_MRES
-# define MAX_MRES 1024
+# define MAX_MRES 8192
#endif /* MAX_MRES */
#ifndef MMAX_RSV
Patch to increase the maximum number of standing reservations.
diff -uNr maui-3.2.6p20.ORIG/include/msched.h maui-3.2.6p20/include/msched.h
--- maui-3.2.6p20.ORIG/include/msched.h 2008-03-17 13:40:14.000000000 +0100
+++ maui-3.2.6p20/include/msched.h 2008-03-17 13:41:38.000000000 +0100
@@ -459,7 +459,7 @@
#define MAX_MFRAMECOUNT 64
#define MAX_MSLOTPERFRAME 32
-#define MAX_MSRES 128
+#define MAX_MSRES 1024
#define MAX_SRES_DEPTH 64
#define MAX_MRANGE 256
#define MAX_PRIO_VAL 1000000000
# ZYV
# hostname of machine on which maui will run. NOTE: this parameter MUST be
# specified.
SERVERHOST hathor
# primary admin must be first in list
ADMIN1 root
# Resource Manager Definition
RMCFG[hathor] TYPE=PBS
RMPOLLINTERVAL 00:00:30
# Allocation Manager Definition
AMCFG[bank] TYPE=NONE
# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration
SERVERPORT 42559
SERVERMODE NORMAL
# Admin: http://supercluster.org/mauidocs/a.esecurity.html
LOGFILE maui.log
LOGFILEMAXSIZE 50000000
LOGLEVEL 0
LOGFILEROLLDEPTH 3
# ZYV
# Set the JOBNODEMATCHPOLICY parameter to EXACTNODE to have Maui support
# PBS's default allocation behavior of <NODECOUNT> nodes with exactly <X>
# tasks per node.
JOBNODEMATCHPOLICY EXACTNODE
# Job Priority: http://supercluster.org/mauidocs/5.1jobprioritization.html
QUEUETIMEWEIGHT 1
# FairShare: http://supercluster.org/mauidocs/6.3fairshare.html
#FSPOLICY PSDEDICATED
#FSDEPTH 7
#FSINTERVAL 86400
#FSDECAY 0.80
# BW 20110316 : enable fairshare
FSPOLICY DEDICATEDPS
FSDEPTH 14
FSINTERVAL 86400
FSDECAY 0.80
FSWEIGHT 500
FSUSERWEIGHT 8
USERCFG[DEFAULT] FSTARGET=25.0
# Throttling Policies: http://supercluster.org/mauidocs/6.2throttlingpolicies.html
# NONE SPECIFIED
# Backfill: http://supercluster.org/mauidocs/8.2backfill.html
#BACKFILLPOLICY FIRSTFIT
BACKFILLPOLICY BESTFIT
RESERVATIONPOLICY CURRENTHIGHEST
# We have nodes with up to 32 CPUs, default is 24
RESDEPTH 64
# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html
NODEALLOCATIONPOLICY MINRESOURCE
# QOS: http://supercluster.org/mauidocs/7.3qos.html
# QOSCFG[hi] PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
# QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE
# Standing Reservations: http://supercluster.org/mauidocs/7.1.3standingreservations.html
# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test] 17:00:00
# SRDAYS[test] MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test] 0:30:00
# Creds: http://supercluster.org/mauidocs/6.1fairnessoverview.html
# USERCFG[DEFAULT] FSTARGET=25.0
# USERCFG[john] PRIORITY=100 FSTARGET=10.0-
# GROUPCFG[staff] PRIORITY=1000 QLIST=hi:low QDEF=hi
# CLASSCFG[batch] FLAGS=PREEMPTEE
# CLASSCFG[interactive] FLAGS=PREEMPTOR
# ZYV: mem/swap limits enforcement
ENFORCERESOURCELIMITS ON
RESOURCELIMITPOLICY MEM:ALWAYS:CANCEL
RESOURCELIMITPOLICY SWAP:ALWAYS:CANCEL
_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers