Hi,

I'm running Torque 2.5.11 / Maui 3.3.1 on a small ~200 core cluster with
typical job queues of 10k jobs or so and maui segfaults every week or
two without logging anything relevant (with log level ERROR).

I've got a core dump which I inspected with gdb and it seems to segfault
on the following line at MPBSI.c:3213:

3213              if (J->Req[0]->DRes.Procs == -1)

Short and full backtraces are attached. There is nothing relevant that I
could find in Torque logs around this time...

Maui is compiled with default options, but with the attached patch from
CERN applied, because before I was experiencing lockups every day or
so... and with the patch and increased reservation depth they are mostly
gone.

The configuration file is also attached to this message.

I have checked the SVN repository, but it seems that no patches have been
committed to it since the 3.3.1 release. Also, I couldn't find anything
relevant on the lists.

If I can be of any help to further diagnose this issue, please let me
know... I'd appreciate any help in sorting this out.

Thanks,

-- 
Sincerely yours,
Yury V. Zaytsev

#0  0x00000000004a663c in MPBSNodeUpdate (N=0x9686a50, PNode=<value optimized 
out>, NState=<value optimized out>, R=0x6a4a100) at MPBSI.c:3213
        tail = 0x7fff466d6641 "/.de"
        JobID = 
".de\000or13\00005].hat\a\000\000\000\000\000\000\000\005\000\000\000\000\000\000\000\001\000\000\000\000\000\000\0000\016�\t\000\000\000\000�fuF�\177\000\000�\036�\t\000\000\000"
        AP = (struct attrl *) 0x9d59df0
        J = (mjob_t *) 0x9e72a50
        ptr = 0x7fff466d6640 "0/.de"
        tmpBuffer = "0/.de\000 1/1134413[370].hathor.bcfgrid.uni-freiburg.de, 
2/1134413[398].hathor.bcfgrid.uni-freiburg.de, 
3/1134413[420].hathor.bcfgrid.uni-freiburg.de", '\0' <repeats 31390 times>, 
"BFWindow", '\0' <repeats 72 times>, 
"\001\v\002\000\000\000\000\000[ALL]\000T", '\0' <repeats 73 times>, "\004"...
        TokPtr = 0x7fff466d6646 " 
1/1134413[370].hathor.bcfgrid.uni-freiburg.de, 
2/1134413[398].hathor.bcfgrid.uni-freiburg.de, 
3/1134413[420].hathor.bcfgrid.uni-freiburg.de"
        tmpTime = 1338142964
        tmpProcs = <value optimized out>
#1  0x00000000004a7262 in MPBSClusterQuery (R=0x6a4a100, RCount=0x7fff46756930, 
EMsg=<value optimized out>, SC=<value optimized out>) at MPBSI.c:1316
        nodes = (struct batch_status *) 0x9d59ba0
        ErrMsg = <value optimized out>
        Name = 
"hathor13\000\026�\2359\177\000\000\000\000\000\000\031\000\000\000����\005\000\000\000\f\000\000\000\000\000\000\000�8\235\a\000\000\000\000�luF�\177\000\0000�H\000\000\000\000"
        NewState = mnsBusy
        NewNode = 0
        OldState = mnsBusy
        cur_node = (struct batch_status *) 0x9d3cf90
        N = (mnode_t *) 0x9686a50
#2  0x0000000000461667 in __MUTFunc (V=0x7fff467567a0) at MUtil.c:4718
No locals.
#3  0x0000000000463d7f in MUThread (F=<value optimized out>, TimeOut=166144592, 
RC=0x1, ACount=0, Lock=<value optimized out>) at MUtil.c:4691
        rc = 0
        MyLock = 1
        D = {
  Func = 0x4a7050 <MPBSClusterQuery>, 
  Arg = {0x6a4a100, 0x7fff46756930, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0xffffffffffffffa8, 0x79d389c, 0xb, 0xa}, 
  TimeOut = 127744116, 
  RC = 0x7fff46756934, 
  Lock = 0x7fff46756838
}
        Args = {{
    gp_offset = 48, 
    fp_offset = 0, 
    overflow_arg_area = 0x7fff46756918, 
    reg_save_area = 0x7fff46756840
  }}
#4  0x000000000049b6c7 in MRMClusterQuery (RCount=0x7fff46756978, SC=0x0) at 
MRM.c:493
        rmindex = 0
        rc = 0
        tmpRCount = 23
        TotalRCount = 0
#5  0x000000000049c794 in MRMGetInfo () at MRM.c:352
        J = <value optimized out>
        JobCount = <value optimized out>
        NodeCount = <value optimized out>
        APIFailureCount = 0
#6  0x000000000042d92d in MSchedProcessJobs (OldDay=0x7fff467d8e30 "Sun", 
GlobalSQ=0x7fff467b6e20, GlobalHQ=0x7fff46796e20) at MSched.c:6923
        PIndex = <value optimized out>
        P = <value optimized out>
        tmpQ = {-1, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 
533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 
  548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 
563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 
  579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 
594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 
  610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 
625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 
  641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 
656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 
  672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 
687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 
  703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 
718, 719...}
        CurrentQ = {-1 <repeats 8117 times>, 0 <repeats 23315 times>, 64, 0, 
1182226751, 32767, 1182226768, 32767, 1182226752, 32767, 3, 0, 0, 2, 0, 0, 0, 
  0, 0, 0, 0, 0, 0, 2, 0 <repeats 12 times>, 3, 0, -1641506304, 32569, 0, 0, 0, 
0, 0, 0, 0, 0, -1644591016, 32569, 3, 0, 1182359200, 32767, 4610132, 0, 
  -1644574254, 32569, 156252368, 0, 156300576, 0, 0, 0, 5010726, 0, 0, 0, 
1182359200, 32767, 156252368, 0, 1182228112, 32767, 0 <repeats 40 times>, 6, 0, 
0, 
  2, 0 <repeats 22 times>, 6, 0, -1641506304, 32569, 1182359704, 32767, 
1182358512, 32767, -1633536352, 32569, 1182359704, 32767, -1644591016, 32569, 
6, 0, 
  1182358512, 32767, 1182359704, 32767, -1644574254, 32569, 1182358512, 32767, 
156252368, 0, 1182358512, 32767, 5009296, 0, 1182362016, 32767, 0, 0, 0, 0, 
  5014631, 0, 0, 0, 0, 0, 1182362010, 32767, 0 <repeats 94 times>, 176, 0, 
1182227855, 32767, 1182227872, 32767, 1182227856, 32767, 128, 0, 0, 9, 0, 0, 0, 
  0, 0, 0, 0, 0, 0, 9, 0 <repeats 12 times>...}
        Value = 140734375751200
#7  0x00000000004062c9 in main (ArgC=1, ArgV=<value optimized out>) at 
Server.c:192
        OldDay = "Sun\000\000\000\000\000\005\017�\a", '\0' <repeats 12 times>, 
"\200\210L", '\0' <repeats 13 times>, 
"C*@\000\000\000\000\000\210\217}F�\177\000\000�\210L\000\000\000\000"
        GlobalSQ = {-1, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 
532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 
  547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 
562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 
  578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 
593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 
  609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 
624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 
  640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 
655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 
  671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 
686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 
  702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 
717, 718, 719...}
        GlobalHQ = {-1, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 
532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 
  547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 
562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 
  578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 
593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 
  609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 
624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 
  640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 
655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 
  671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 
686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 
  702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 
717, 718, 719...}
        tmpArgV = {0x0 <repeats 651 times>, 0x7fff467d8300 "", 0x7fff467d8610 
"�\026@", 0x7f399e82026f "H\203�\001H\215\025�(\001", 0x0, 0x0, 0x0, 0x0, 0x0, 
  0x0, 0x0, 0x0, 0x0, 0x168000 <Address 0x168000 out of bounds>, 0x167c50 
<Address 0x167c50 out of bounds>, 0x167c50 <Address 0x167c50 out of bounds>, 
0x0, 
  0x7f399e827525 "\205�u�H\203�\b�\001", 0x368000 <Address 0x368000 out of 
bounds>, 0x7f399ea38528 "", 0x7f399df3160d "ld-linux-x86-64.so.2", 
  0x7f399e820a24 "\205�t�H\201��\003", 0x168000 <Address 0x168000 out of 
bounds>, 0x3 <Address 0x3 out of bounds>, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
  0x1 <Address 0x1 out of bounds>, 0x7fff467d83f0 "", 0x7fff467d8640 
"�\211}F�\177", 0x7f399e82026f "H\203�\001H\215\025�(\001", 0x0, 
  0x5 <Address 0x5 out of bounds>, 0x283000 <Address 0x283000 out of bounds>, 
0x285000 <Address 0x285000 out of bounds>, 
  0x284090 <Address 0x284090 out of bounds>, 0x2840d8 <Address 0x2840d8 out of 
bounds>, 0x83000 <Address 0x83000 out of bounds>, 
  0x3 <Address 0x3 out of bounds>, 0x0, 0x2f000 <Address 0x2f000 out of 
bounds>, 0x2ea74 <Address 0x2ea74 out of bounds>, 
  0x2ea74 <Address 0x2ea74 out of bounds>, 0x0, 0x5 <Address 0x5 out of 
bounds>, 0x22f000 <Address 0x22f000 out of bounds>, 
  0x7f399e827525 "\205�u�H\203�\b�\001", 0x230c60 <Address 0x230c60 out of 
bounds>, 0x7f399ea24000 "", 0x7f399e294e3c "libc.so.6", 
  0x7f399e820a24 "\205�t�H\201��\003", 0x0, 0x100000000 <Address 0x100000000 
out of bounds>, 0x4016f2 "libc.so.6", 0x7f399ea35eb8 "/lib/libc.so.6", 
  0xa <Address 0xa out of bounds>, 0x7f399ea36000 "", 0x7fff467d8610 "�\026@", 
0x0, 0x1 <Address 0x1 out of bounds>, 0x1 <Address 0x1 out of bounds>, 
  0x7f399ea36001 "", 0x7f399e8200dd "�E\020\b\017\204\232\002", 0x7f399ea354c8 
"", 0x0, 0x7fff467d8610 "�\026@", 
  0x7f399e81e110 
"A\211�A\213D\236$\205�\017\2040���\203�\001\017\225�\017��A\t�A\203��\017\2051���H9\035�\234!",
 
  0x726f742f74706f2f <Address 0x726f742f74706f2f out of bounds>, 
0x2e352e322d657571 <Address 0x2e352e322d657571 out of bounds>, 
  0x696c2f62696c2f39 <Address 0x696c2f62696c2f39 out of bounds>, 
0x362e6f732e6362 <Address 0x362e6f732e6362 out of bounds>, 
  0x726f742f74706f2f <Address 0x726f742f74706f2f out of bounds>, 0x7f399e28d298 
"", 0x7fff467d86c8 "\006", 0x7f399e827525 "\205�u�H\203�\b�\001", 
  0x3322e6f73 <Address 0x3322e6f73 out of bounds>, 0x7f399ea24000 "", 
0x7f399e59684e "libc.so.6", 0x7f399e820a24 "\205�t�H\201��\003", 
  0x7fff467d8480 "�T�\2369\177", 0x3467d8680 <Address 0x3467d8680 out of 
bounds>, 0x7f399ea38c80 "\001", 0x7f399ea36318 "�T�\2369\177", 0x0, 
  0xa <Address 0xa out of bounds>, 0x401601 "bs_statjob", 0x0, 0x1 <Address 0x1 
out of bounds>, 0x7f399e289000 "", 0x801 <Address 0x801 out of bounds>, 
  0x3fe9 <Address 0x3fe9 out of bounds>, 0x1 <Address 0x1 out of bounds>, 
0x81ed <Address 0x81ed out of bounds>, 0x0, 0x0, 
  0x16ed30 <Address 0x16ed30 out of bounds>, 0x1000 <Address 0x1000 out of 
bounds>, 0xb80 <Address 0xb80 out of bounds>, 
  0x4cdad872 <Address 0x4cdad872 out of bounds>, 0x0, 0x4cc0eff4 <Address 
0x4cc0eff4 out of bounds>, 0x0, 0x4cdad871 <Address 0x4cdad871 out of bounds>, 
  0x0, 0x0, 0x0, 0x0, 0x300000224 <Address 0x300000224 out of bounds>, 0x3 
<Address 0x3 out of bounds>, 
  0x7f399ea30fd7 <Address 0x7f399ea30fd7 out of bounds>, 0x7f399ea36000 "", 
0x0, 0x7f399ea36001 "", 0x4016f2 "libc.so.6", 
  0x7f399e820b3b "H\211������\017\037D", 0x7f3900000000 <Address 0x7f3900000000 
out of bounds>, 0x7fff467d89c8 "p\217}F�\177", 0x0, 0x7f399ea36001 "", 
  0x7fff467d89df "", 0x7fff467d89d0 "�^�\2369\177", 0x7fff467d8680 "@\003", 
0x0, 0x1 <Address 0x1 out of bounds>, 0xa <Address 0xa out of bounds>, 
  0x7f399ea36000 "", 0x1007fff467d8a00 <Address 0x1007fff467d8a00 out of 
bounds>, 0x340 <Address 0x340 out of bounds>, 
  0x10102464c457f <Address 0x10102464c457f out of bounds>, 0x0, 0x1003e0003 
<Address 0x1003e0003 out of bounds>, 0x1e6d0 <Address 0x1e6d0 out of bounds>, 
  0x40 <Address 0x40 out of bounds>, 0x16dbb0 <Address 0x16dbb0 out of bounds>, 
0x38004000000000 <Address 0x38004000000000 out of bounds>, 
  0x4500460040000a <Address 0x4500460040000a out of bounds>, 0x500000006 
<Address 0x500000006 out of bounds>, 0x40 <Address 0x40 out of bounds>, 
  0x40 <Address 0x40 out of bounds>, 0x40 <Address 0x40 out of bounds>, 
0x7fff467d8cd0 "\210\216}F�\177", 0x7fff467d8d40 ".N=�", 
  0x10 <Address 0x10 out of bounds>, 0x7f399e287b40 "\001", 0x0, 0x7f399ea24000 
"", 0x7f399e8246ed "H\211C H\203�\020[�f\017\037\204", 0x0, 
  0x1c <Address 0x1c out of bounds>, 0x0, 0x7f399e8267d6 "H\213\204$\030\001", 
0x7fff467d8cd0 "\210\216}F�\177", 0x7f399e8246b0 "SH\211��\001", 
  0x7fff467d8d0f "", 0x7fff467d8d00 "\020��\2359\177", 0x7fff467d8cf8 "�\032@", 
0x7f399ea38b38 "", 0x600000001 <Address 0x600000001 out of bounds>, 0x0, 
  0x368728 <Address 0x368728 out of bounds>, 0x7f399df3160d 
"ld-linux-x86-64.so.2", 0xfffe8cfb1a800000 <Address 0xfffe8cfb1a800000 out of 
bounds>, 
  0x10 <Address 0x10 out of bounds>, 0x7f399e287b40 "\001", 0x0, 0x7f399ea24000 
"", 0xfffe8cfb0e800000 <Address 0xfffe8cfb0e800000 out of bounds>, 
  0xfe733d04cf760000 <Address 0xfe733d04cf760000 out of bounds>, 0x1e0 <Address 
0x1e0 out of bounds>, 0x1e0 <Address 0x1e0 out of bounds>, 
  0x8 <Address 0x8 out of bounds>, 0x400000004 <Address 0x400000004 out of 
bounds>, 0x7fff467d8cd0 "\210\216}F�\177", 0x7fff467d8d40 ".N=�", 
  0x10 <Address 0x10 out of bounds>, 0x7f399e4bdc88 "\001", 0x0, 0x7f399ea359f8 
"", 0x7f399e8246ed "H\211C H\203�\020[�f\017\037\204", 0x0, 
  0x2 <Address 0x2 out of bounds>, 0x7f399ea24520 "", 0x7f399e8257fc 
"D\213\215����L\213\205����A\203�\001\017\206\035����V���H\213\225\020���\203��\002",
 
  0x7fff00000001 <Address 0x7fff00000001 out of bounds>, 0x7f399ea38528 "", 
0x0, 0x0, 0x7f399df3160d "ld-linux-x86-64.so.2"...}
        aindex = <value optimized out>
#0  0x00000000004a663c in MPBSNodeUpdate (N=0x9686a50, PNode=<value optimized 
out>, NState=<value optimized out>, R=0x6a4a100) at MPBSI.c:3213
#1  0x00000000004a7262 in MPBSClusterQuery (R=0x6a4a100, RCount=0x7fff46756930, 
EMsg=<value optimized out>, SC=<value optimized out>) at MPBSI.c:1316
#2  0x0000000000461667 in __MUTFunc (V=0x7fff467567a0) at MUtil.c:4718
#3  0x0000000000463d7f in MUThread (F=<value optimized out>, TimeOut=166144592, 
RC=0x1, ACount=0, Lock=<value optimized out>) at MUtil.c:4691
#4  0x000000000049b6c7 in MRMClusterQuery (RCount=0x7fff46756978, SC=0x0) at 
MRM.c:493
#5  0x000000000049c794 in MRMGetInfo () at MRM.c:352
#6  0x000000000042d92d in MSchedProcessJobs (OldDay=0x7fff467d8e30 "Sun", 
GlobalSQ=0x7fff467b6e20, GlobalHQ=0x7fff46796e20) at MSched.c:6923
#7  0x00000000004062c9 in main (ArgC=1, ArgV=<value optimized out>) at 
Server.c:192
# Increase the length of diagnose -f.

===================================================================
RCS file: maui-3.2.6p20/include/RCS/msched-common.h,v
retrieving revision 1.1
diff -u -r1.1 maui-3.2.6p20/include/msched-common.h
--- maui-3.2.6p20/include/msched-common.h	2008/09/04 11:45:37	1.1
+++ maui-3.2.6p20/include/msched-common.h	2008/09/04 11:46:31
@@ -150,14 +150,14 @@
 #endif /* DEFAULT */
 
 #define MAX_PATH_LEN        256
-#define MAX_SBUFFER      262144
+#define MAX_SBUFFER     2097152
 
 #define MAX_MPATH_LEN       256
 #define MMAX_PATH_LEN       256
 
 #define MAX_MLINE          1024
 #define MMAX_LINE          1024
-#define MAX_MBUFFER       65536
+#define MAX_MBUFFER      524288
 #define MMAX_BUFFER       65536
 #define MAX_MSBUFFER     262144
 #define MAX_MENVVAR          64
diff -uNr maui-3.2.6p9.orig/include/msched-common.h maui-3.2.6p9/include/msched-common.h
--- maui-3.2.6p9.orig/include/msched-common.h	2004-12-03 13:55:22.000000000 +0000
+++ maui-3.2.6p9/include/msched-common.h	2004-12-03 13:56:24.000000000 +0000
@@ -368,7 +368,7 @@
   unsigned long InitialWorkload;
   } mrclass_t;
 
-#define MAX_MCLASS  16            
+#define MAX_MCLASS  64           
 #define MMAX_CLASS  16
 
 #define MAX_MGRES    4    
diff -uNr maui-3.2.6p14.ORIG/include/msched.h maui-3.2.6p14/include/msched.h
--- maui-3.2.6p14.ORIG/include/msched.h	2006-02-15 10:17:16.000000000 +0000
+++ maui-3.2.6p14/include/msched.h	2006-02-15 10:20:13.000000000 +0000
@@ -440,11 +440,11 @@
 
 #define MAX_VAL      2140000000
 
-#define MMAX_JOB           4096
+#define MMAX_JOB          32768
 #define MAX_MJOB       MMAX_JOB
 
 
-#define MAX_MJOB_TRACE     4096
+#define MAX_MJOB_TRACE    32768
 
 #define MAX_TASK_REQUESTS    32
 #define MAX_REQ_TYPE         16
diff -uNr maui-3.2.6p14.ORIG/include/moab.h maui-3.2.6p14/include/moab.h
--- maui-3.2.6p14.ORIG/include/moab.h	2006-05-04 17:37:38.000000000 +0100
+++ maui-3.2.6p14/include/moab.h	2006-05-04 17:41:38.000000000 +0100
@@ -310,7 +310,7 @@
 #endif /* MAX_MATTR */
 
 #ifndef MAX_MRES
-# define MAX_MRES          1024
+# define MAX_MRES          8192
 #endif /* MAX_MRES */
 
 #ifndef MMAX_RSV
Patch to increase the maximum number of standing reservations.


diff -uNr maui-3.2.6p20.ORIG/include/msched.h maui-3.2.6p20/include/msched.h
--- maui-3.2.6p20.ORIG/include/msched.h	2008-03-17 13:40:14.000000000 +0100
+++ maui-3.2.6p20/include/msched.h	2008-03-17 13:41:38.000000000 +0100
@@ -459,7 +459,7 @@
 
 #define MAX_MFRAMECOUNT      64
 #define MAX_MSLOTPERFRAME    32
-#define MAX_MSRES           128
+#define MAX_MSRES          1024
 #define MAX_SRES_DEPTH       64
 #define MAX_MRANGE          256
 #define MAX_PRIO_VAL 1000000000
# ZYV

# hostname of machine on which maui will run. NOTE: this parameter MUST be specified.

SERVERHOST            hathor

# primary admin must be first in list

ADMIN1                root

# Resource Manager Definition

RMCFG[hathor]           TYPE=PBS
RMPOLLINTERVAL          00:00:30

# Allocation Manager Definition

AMCFG[bank]  TYPE=NONE

# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration

SERVERPORT            42559
SERVERMODE            NORMAL

# Admin: http://supercluster.org/mauidocs/a.esecurity.html

LOGFILE               maui.log
LOGFILEMAXSIZE        50000000
LOGLEVEL              0
LOGFILEROLLDEPTH      3

# ZYV

# Set the JOBNODEMATCHPOLICY parameter to EXACTNODE to have Maui support
# PBS's default allocation behavior of <NODECOUNT> nodes with exactly <X>
# tasks per node.

JOBNODEMATCHPOLICY EXACTNODE

# Job Priority: http://supercluster.org/mauidocs/5.1jobprioritization.html

QUEUETIMEWEIGHT       1

# FairShare: http://supercluster.org/mauidocs/6.3fairshare.html

#FSPOLICY              PSDEDICATED
#FSDEPTH               7
#FSINTERVAL            86400
#FSDECAY               0.80

# BW 20110316 : enable fairshare
FSPOLICY              DEDICATEDPS
FSDEPTH               14
FSINTERVAL            86400
FSDECAY               0.80

FSWEIGHT              500
FSUSERWEIGHT          8
USERCFG[DEFAULT]      FSTARGET=25.0

# Throttling Policies: http://supercluster.org/mauidocs/6.2throttlingpolicies.html

# NONE SPECIFIED

# Backfill: http://supercluster.org/mauidocs/8.2backfill.html

#BACKFILLPOLICY        FIRSTFIT

BACKFILLPOLICY        BESTFIT
RESERVATIONPOLICY     CURRENTHIGHEST

# We have nodes with up to 32 CPUs, default is 24
RESDEPTH              64

# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html

NODEALLOCATIONPOLICY  MINRESOURCE

# QOS: http://supercluster.org/mauidocs/7.3qos.html

# QOSCFG[hi]  PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
# QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE

# Standing Reservations: http://supercluster.org/mauidocs/7.1.3standingreservations.html

# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test]   17:00:00
# SRDAYS[test]      MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test]   0:30:00

# Creds: http://supercluster.org/mauidocs/6.1fairnessoverview.html

# USERCFG[DEFAULT]      FSTARGET=25.0
# USERCFG[john]         PRIORITY=100  FSTARGET=10.0-
# GROUPCFG[staff]       PRIORITY=1000 QLIST=hi:low QDEF=hi
# CLASSCFG[batch]       FLAGS=PREEMPTEE
# CLASSCFG[interactive] FLAGS=PREEMPTOR

# ZYV: mem/swap limits enforcement

ENFORCERESOURCELIMITS   ON
RESOURCELIMITPOLICY     MEM:ALWAYS:CANCEL
RESOURCELIMITPOLICY     SWAP:ALWAYS:CANCEL
_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers

Reply via email to