Josh, good day.

Thu, Nov 22, 2007 at 02:48:53PM +0300, Eygene Ryabinkin wrote:
> Wed, Aug 15, 2007 at 09:45:45PM +0400, Eygene Ryabinkin wrote:
> > Wed, Aug 15, 2007 at 11:29:44AM -0600, Josh Butikofer wrote:
> > > Because this patch changes some fundamental Maui scheduling code,
> > > one of the other developers here wants to do a thorough review
> > > of the changes you've proposed. He says that this will probably occur
> > > before the end of the month. Sorry for the delay. We are confident
> > > that your changes will have no negative side-effects for the community,
> > > but we need to make sure it is the best direction for the
> > > code. I hope you can understand our position.
> > 
> > Yeah, sure -- the review will be very good, so I am for it: I could
> > have overlooked something or taken a suboptimal path.  So I understand
> > your position and support it ;))
> > 
> > > I'll keep you posted.
> 
> Any news on the patch I coined?

I have not heard anything about my (old) patch for the default partition
handling.  I have ported it to patchlevel 20, since it did not apply
cleanly to some snapshot.  The attached files are the new patches,
which apply cleanly to the latest snapshot,
maui-3.2.6p20-snap.12126171145.

So, what is the status of this patch?  Has it been reviewed?  It has
been working for almost a year on our cluster, if anyone is interested ;))
-- 
Eygene Ryabinkin, Russian Research Centre "Kurchatov Institute"
>From cc6f8495844e95bd45fe4e303f1fed3e663f155f Mon Sep 17 00:00:00 2001
From: Eygene Ryabinkin <[EMAIL PROTECTED]>
Date: Tue, 20 Mar 2007 15:59:25 +0300
Subject: [PATCH] Prepare the MQueueSelectJobs() for the two-pass scheduling.

In order to get the 'PDEF' statement to work correctly we need
two-pass scheduling over the partitions: first, for each partition,
only the jobs for which that partition is the default are considered;
then the rest of the jobs are considered for scheduling. Note that
the first pass must walk over all partitions before the second pass
over all partitions and the remaining jobs begins: we should first
select ALL jobs that can fit into their default partitions.

The current patch is a no-op from the functional point of view.
It just moves the checking of a single job into the local function
MQueueCheckSingleJob(), whose body was taken from the original body
of MQueueSelectJobs().

The patch was tested on the RRC-KI Grid cluster and has so far shown
no regressions in its daily operation.

Signed-off-by: Eygene Ryabinkin <[EMAIL PROTECTED]>
---
 src/moab/MPolicy.c |  582 ++++++++++++++++++++++++++++------------------------
 1 files changed, 310 insertions(+), 272 deletions(-)

diff --git a/src/moab/MPolicy.c b/src/moab/MPolicy.c
index 9b1e873..bfca663 100644
--- a/src/moab/MPolicy.c
+++ b/src/moab/MPolicy.c
@@ -147,10 +147,19 @@ extern mres_t     *MRes[];
 
 */
 
-/* NYI:  must handle effqduration */
-
+static int MQueueCheckSingleJob(
+  mjob_t	*J,
+  int		*Reason,
+  mpar_t	*P,
+  mpar_t	*GP,
+  int            PLevel,
+  int            MaxNC,
+  int            MaxPC,
+  unsigned long  MaxWCLimit,
+  int            OrigPIndex,
+  mbool_t        UpdateStats);
 
-   
+/* NYI:  must handle effqduration */
 
 int MQueueSelectJobs(
 
@@ -171,27 +180,14 @@ int MQueueSelectJobs(
 
   mjob_t  *J;
 
-  char     DValue[MAX_MNAME];
-  enum MJobDependEnum DType;
-
   mpar_t  *P;
   mpar_t  *GP;
 
-  long     PS;
-
   int      LReason[MAX_MREJREASON];
-  int      PReason;
 
   int     *Reason;
 
   int      PIndex;
-  int      PReq;
-
-  mreq_t  *RQ;
-
-  double   PE;
-
-  char     tmpLine[MAX_MLINE];
 
   const char *FName = "MQueueSelectJobs";
 
@@ -267,368 +263,410 @@ int MQueueSelectJobs(
       continue;
       }
 
-    RQ = J->Req[0]; /* FIXME */
+    if (MQueueCheckSingleJob(J, Reason, P, GP, PLevel,
+	MaxNC, MaxPC, MaxWCLimit, OrigPIndex, UpdateStats) == FAILURE)
+      continue;
 
-    /* if job removed */
+    /* NOTE:  effective queue duration not yet properly supported */
 
-    if (J->Name[0] == '\0')
-      {
-      Reason[marCorruption]++;
+    J->EffQueueDuration = (MSched.Time > J->SystemQueueTime) ? 
+      MSched.Time - J->SystemQueueTime : 0;
+ 
+    /* add job to destination queue */
 
-      continue;
-      }
+    DBG(5,fSCHED) DPrint("INFO:     job '%s' added to queue at slot %d\n",
+      J->Name,
+      sindex);
 
-    if (UpdateStats == TRUE)
-      {
-      J->BlockReason = 0;
+    DstQ[sindex++] = SrcQ[jindex];
+    }  /* END for (jindex) */
 
-      if (J->State == mjsIdle)
-        MStat.IdleJobs++;
-      }
+  /* terminate list */
 
-    PReq = MJobGetProcCount(J);
-    MJobGetPE(J,P,&PE);
-    PS   = (long)PReq * J->SpecWCLimit[0];
+  DstQ[sindex] = -1;
 
-    /* check partition */
+  DBG(1,fSCHED)
+    {
+    DBG(1,fSCHED) DPrint("INFO:     total jobs selected in partition %s: %d/%-d ",
+      MAList[ePartition][PIndex],
+      sindex,
+      jindex);
 
-    if (OrigPIndex != -1)
+    for (index = 0;index < MAX_MREJREASON;index++)
       {
-      if ((P->Index == 0) && !(J->Flags & (1 << mjfSpan)))
+      if (Reason[index] != 0)
         {
-        /* why?  what does partition '0' mean in partition mode? */
+        fprintf(mlog.logfp,"[%s: %d]",
+          MAllocRejType[index],
+          Reason[index]);
+        }
+      }    /* END for (index) */
 
-        DBG(3,fSCHED) DPrint("INFO:     job %s not considered for spanning\n",
-          J->Name);
+    fprintf(mlog.logfp,"\n");
+    }
 
-        Reason[marPartitionAccess]++;
+  if (sindex == 0)
+    return(FAILURE);
 
-        continue;
-        }
-      else if ((P->Index != 0) && (J->Flags & (1 << mjfSpan)))
-        {
-        DBG(3,fSCHED) DPrint("INFO:     spanning job %s not considered for partition scheduling\n",
-          J->Name);
+  return(SUCCESS);
+  }  /* END MQueueSelectJobs() */
 
-        Reason[marPartitionAccess]++;
+/*
+ * Helper for MQueueSelectJobs: performs the single job evaluation.
+ * Returns SUCCESS if job can be queued and FAILURE otherwise.
+ */
+static int MQueueCheckSingleJob(
+  mjob_t	*J,
+  int		*Reason,
+  mpar_t	*P,
+  mpar_t	*GP,
+  int            PLevel,
+  int            MaxNC,
+  int            MaxPC,
+  unsigned long  MaxWCLimit,
+  int            OrigPIndex,
+  mbool_t        UpdateStats)
 
-        continue;
-        }
+  {
+  char     DValue[MAX_MNAME];
+  enum MJobDependEnum DType;
 
-      if ((P->Index > 0) && (MUBMCheck(P->Index,J->PAL) == FAILURE))
-        {
-        DBG(7,fSCHED) DPrint("INFO:     job %s not considered for partition %s (allowed %s)\n",
-          J->Name,
-          P->Name,
-          MUListAttrs(ePartition,J->PAL[0]));
+  long     PS;
 
-        Reason[marPartitionAccess]++;
+  int      PReason;
 
-        continue;
-        }
-      }   /* END if (OrigPIndex != -1) */
+  int      PReq;
 
-    /* check job state */
+  mreq_t  *RQ;
 
-    if ((J->State != mjsIdle) && (J->State != mjsSuspended))
-      {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle state '%s')\n",
-        J->Name,
-        MJobState[J->State]);
+  double   PE;
 
-      Reason[marState]++;
+  char     tmpLine[MAX_MLINE];
 
-      if ((MaxNC == MAX_MNODE) && 
-          (MaxWCLimit == MAX_MTIME) && 
-          (J->R != NULL))
-        {
-        if ((J->State != mjsStarting) && (J->State != mjsRunning))
-          MResDestroy(&J->R);
-        }
+  const char *FName = "MQueueCheckSingleJob";
 
-      continue;
-      }
+  RQ = J->Req[0]; /* FIXME */
 
-    /* check if job has been previously scheduled or deferred */
+  /* if job removed */
 
-    if ((J->EState != mjsIdle) && (J->EState != mjsSuspended))
+  if (J->Name[0] == '\0')
+    {
+    Reason[marCorruption]++;
+
+    return(FAILURE);
+    }
+
+  if (UpdateStats == TRUE)
+    {
+    J->BlockReason = 0;
+
+    if (J->State == mjsIdle)
+      MStat.IdleJobs++;
+    }
+
+  PReq = MJobGetProcCount(J);
+  /* XXX: PE is unused? */
+  MJobGetPE(J,P,&PE);
+  PS   = (long)PReq * J->SpecWCLimit[0];
+
+  /* check partition */
+
+  if (OrigPIndex != -1)
+    {
+    if ((P->Index == 0) && !(J->Flags & (1 << mjfSpan)))
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle expected state: '%s')\n",
-        J->Name,
-        MJobState[J->EState]);
+      /* why?  what does partition '0' mean in partition mode? */
 
-      Reason[marEState]++;
+      DBG(3,fSCHED) DPrint("INFO:     job %s not considered for spanning\n",
+        J->Name);
 
-      if ((MaxNC == MAX_MNODE) && (MaxWCLimit == MAX_MTIME) && (J->R != NULL))
-        {
-        if ((J->EState != mjsStarting) && (J->EState != mjsRunning))
-          MResDestroy(&J->R);
-        }
+      Reason[marPartitionAccess]++;
 
-      continue;
+      return(FAILURE);
       }
+    else if ((P->Index != 0) && (J->Flags & (1 << mjfSpan)))
+      {
+      DBG(3,fSCHED) DPrint("INFO:     spanning job %s not considered for partition scheduling\n",
+        J->Name);
 
-    /* check available procs */
+      Reason[marPartitionAccess]++;
+
+      return(FAILURE);
+      }
 
-    if (PReq > P->CRes.Procs)
+    if ((P->Index > 0) && (MUBMCheck(P->Index,J->PAL) == FAILURE))
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds configured procs: %d > %d)\n",
+      DBG(7,fSCHED) DPrint("INFO:     job %s not considered for partition %s (allowed %s)\n",
         J->Name,
         P->Name,
-        PReq,
-        P->CRes.Procs);
+        MUListAttrs(ePartition,J->PAL[0]));
 
-      Reason[marNodeCount]++;
+      Reason[marPartitionAccess]++;
 
-      if (P->Index <= 0)
-        {
-        if (J->R != NULL)
-          MResDestroy(&J->R);
+      return(FAILURE);
+      }
+    }   /* END if (OrigPIndex != -1) */
 
-        if (J->Hold == 0)
-          {
-          MJobSetHold(
-            J,
-            (1 << mhDefer),
-            MSched.DeferTime,
-            mhrNoResources,
-            "exceeds partition configured procs");
-          }
-        }
+  /* check job state */
 
-      continue;
+  if ((J->State != mjsIdle) && (J->State != mjsSuspended))
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle state '%s')\n",
+      J->Name,
+      MJobState[J->State]);
+
+    Reason[marState]++;
+
+    if ((MaxNC == MAX_MNODE) && 
+        (MaxWCLimit == MAX_MTIME) && 
+        (J->R != NULL))
+      {
+      if ((J->State != mjsStarting) && (J->State != mjsRunning))
+        MResDestroy(&J->R);
       }
 
-    /* check partition specific limits */
+    return(FAILURE);
+    }
 
-    if (MJobCheckLimits(
-          J,
-          PLevel,
-          P,
-          (1 << mlSystem),
-          tmpLine) == FAILURE)
+  /* check if job has been previously scheduled or deferred */
+
+  if ((J->EState != mjsIdle) && (J->EState != mjsSuspended))
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle expected state: '%s')\n",
+      J->Name,
+      MJobState[J->EState]);
+
+    Reason[marEState]++;
+
+    if ((MaxNC == MAX_MNODE) && (MaxWCLimit == MAX_MTIME) && (J->R != NULL))
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (%s)\n",
-        J->Name,
-        P->Name,
-        tmpLine);
+      if ((J->EState != mjsStarting) && (J->EState != mjsRunning))
+        MResDestroy(&J->R);
+      }
 
-      Reason[marSystemLimits]++;
+    return(FAILURE);
+    }
 
-      if (P->Index <= 0)
-        {
-        if (J->R != NULL)
-          MResDestroy(&J->R);
+  /* check available procs */
+
+  if (PReq > P->CRes.Procs)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds configured procs: %d > %d)\n",
+      J->Name,
+      P->Name,
+      PReq,
+      P->CRes.Procs);
 
+    Reason[marNodeCount]++;
+
+    if (P->Index <= 0)
+      {
+      if (J->R != NULL)
+        MResDestroy(&J->R);
+
+      if (J->Hold == 0)
+        {
         MJobSetHold(
           J,
           (1 << mhDefer),
           MSched.DeferTime,
-          mhrSystemLimits,
-          "exceeds system proc/job limit");
+          mhrNoResources,
+          "exceeds partition configured procs");
         }
+      }
 
-      continue;
-      }  /* END if (MJobCheckLimits() == FAILURE) */
-
-    /* check job size */
-
-    if (PReq > MaxPC)
-      {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window size: %d > %d)\n",
-        J->Name,
-        P->Name,
-        PReq,
-        MaxPC);
+    return(FAILURE);
+    }
 
-      Reason[marNodeCount]++;
+  /* check partition specific limits */
 
-      continue;
-      }
+  if (MJobCheckLimits(
+        J,
+        PLevel,
+        P,
+        (1 << mlSystem),
+        tmpLine) == FAILURE)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (%s)\n",
+      J->Name,
+      P->Name,
+      tmpLine);
 
-    /* check job duration */
+    Reason[marSystemLimits]++;
 
-    if (J->SpecWCLimit[0] > MaxWCLimit)
+    if (P->Index <= 0)
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window time: %ld > %ld)\n",
-        J->Name,
-        P->Name,
-        J->SpecWCLimit[0],
-        MaxWCLimit);
-
-      Reason[marTime]++;
+      if (J->R != NULL)
+        MResDestroy(&J->R);
 
-      continue;
+      MJobSetHold(
+        J,
+        (1 << mhDefer),
+        MSched.DeferTime,
+        mhrSystemLimits,
+        "exceeds system proc/job limit");
       }
 
-    /* check partition class support */
-
-    if (P->Index > 0)
-      {
-      if (MUNumListGetCount(J->StartPriority,RQ->DRes.PSlot,P->CRes.PSlot,0,NULL) == FAILURE)
-        {
-        DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (classes not supported '%s')\n",
-          J->Name,
-          P->Name,
-          MUCAListToString(RQ->DRes.PSlot,P->CRes.PSlot,NULL));
+    return(FAILURE);
+    }  /* END if (MJobCheckLimits() == FAILURE) */
 
-        Reason[marClass]++;
+  /* check job size */
 
-        if (J->R != NULL)
-          MResDestroy(&J->R);
+  if (PReq > MaxPC)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window size: %d > %d)\n",
+      J->Name,
+      P->Name,
+      PReq,
+      MaxPC);
 
-        continue;
-        }
-      }      /* END if (PIndex) */
+    Reason[marNodeCount]++;
 
-    if (MJobCheckDependency(J,&DType,DValue) == FAILURE)
-      {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected (dependent on job '%s' %s)\n",
-        J->Name,
-        DValue,
-        MJobDependType[DType]);
+    return(FAILURE);
+    }
 
-      if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
-        {
-        J->SystemQueueTime = MSched.Time;
-        }
+  /* check job duration */
 
-      Reason[marDepend]++;
+  if (J->SpecWCLimit[0] > MaxWCLimit)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window time: %ld > %ld)\n",
+      J->Name,
+      P->Name,
+      J->SpecWCLimit[0],
+      MaxWCLimit);
 
-      if ((MaxNC == MAX_MNODE) &&
-          (MaxWCLimit == MAX_MTIME) &&
-          (J->R != NULL))
-        {
-        MResDestroy(&J->R);
-        }
+    Reason[marTime]++;
 
-      continue;
-      }  /* END if (MJobCheckDependency(J,&JDepend) == FAILURE) */
+    return(FAILURE);
+    }
 
-    /* check partition active job policies */
+  /* check partition class support */
 
-    if (MJobCheckPolicies(
-          J,
-          PLevel,
-          (1 << mlActive),
-          P,   /* NOTE:  may set to &MPar[0] */
-          &PReason,
-          NULL,
-          MAX_MTIME) == FAILURE)
+  if (P->Index > 0)
+    {
+    if (MUNumListGetCount(J->StartPriority,RQ->DRes.PSlot,P->CRes.PSlot,0,NULL) == FAILURE)
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (policy failure: '%s')\n",
+      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (classes not supported '%s')\n",
         J->Name,
         P->Name,
-        MPolicyRejection[PReason]);
-
-      if (PLevel == ptHARD)
-        {
-        if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
-          {
-          J->SystemQueueTime = MSched.Time;
-          }
-        }
+        MUCAListToString(RQ->DRes.PSlot,P->CRes.PSlot,NULL));
 
-      Reason[marPolicy]++;
+      Reason[marClass]++;
 
-      if ((MaxNC == MAX_MNODE) && 
-          (MaxWCLimit == MAX_MTIME) && 
-          (J->R != NULL))
-        {
+      if (J->R != NULL)
         MResDestroy(&J->R);
-        }
 
-      continue;
+      return(FAILURE);
       }
+    }      /* END if (PIndex) */
 
-    J->Cred.U->MTime = MSched.Time;
-    J->Cred.G->MTime = MSched.Time;
+  if (MJobCheckDependency(J,&DType,DValue) == FAILURE)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected (dependent on job '%s' %s)\n",
+      J->Name,
+      DValue,
+      MJobDependType[DType]);
+
+    if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
+      {
+      J->SystemQueueTime = MSched.Time;
+      }
 
-    if (J->Cred.A != NULL)
-      J->Cred.A->MTime = MSched.Time;
+    Reason[marDepend]++;
 
-    if (MPar[0].FSC.FSPolicy != fspNONE)
+    if ((MaxNC == MAX_MNODE) &&
+        (MaxWCLimit == MAX_MTIME) &&
+        (J->R != NULL))
       {
-      int OIndex;
+      MResDestroy(&J->R);
+      }
 
-      if (MFSCheckCap(NULL,J,P,&OIndex) == FAILURE)
-        {
-        DBG(5,fSCHED) DPrint("INFO:     job '%s' exceeds %s FS cap\n",
-          J->Name,
-          (OIndex > 0) ? MXO[OIndex] : "NONE");
- 
-        if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
-          {
-          J->SystemQueueTime = MSched.Time;
-          }
- 
-        Reason[marFairShare]++;
+    return(FAILURE);
+    }  /* END if (MJobCheckDependency(J,&JDepend) == FAILURE) */
 
-        continue;
-        }
-      }    /* END if (FS[0].FSPolicy != fspNONE) */
+  /* check partition active job policies */
 
-    /* NOTE:  idle queue policies handled in MQueueSelectAllJobs() */
+  if (MJobCheckPolicies(
+        J,
+        PLevel,
+        (1 << mlActive),
+        P,   /* NOTE:  may set to &MPar[0] */
+        &PReason,
+        NULL,
+        MAX_MTIME) == FAILURE)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (policy failure: '%s')\n",
+      J->Name,
+      P->Name,
+      MPolicyRejection[PReason]);
 
-    if (MLocalCheckFairnessPolicy(J,MSched.Time,NULL) == FAILURE)
+    if (PLevel == ptHARD)
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (violates local fairness policy)\n",
-        J->Name,
-        P->Name);
-
-      if (GP->JobPrioAccrualPolicy == jpapFullPolicy) 
+      if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
         {
         J->SystemQueueTime = MSched.Time;
         }
+      }
 
-      Reason[marPolicy]++;
+    Reason[marPolicy]++;
 
-      continue;
+    if ((MaxNC == MAX_MNODE) && 
+        (MaxWCLimit == MAX_MTIME) && 
+        (J->R != NULL))
+      {
+      MResDestroy(&J->R);
       }
 
-    /* NOTE:  effective queue duration not yet properly supported */
+    return(FAILURE);
+    }
 
-    J->EffQueueDuration = (MSched.Time > J->SystemQueueTime) ? 
-      MSched.Time - J->SystemQueueTime : 0;
- 
-    /* add job to destination queue */
+  J->Cred.U->MTime = MSched.Time;
+  J->Cred.G->MTime = MSched.Time;
 
-    DBG(5,fSCHED) DPrint("INFO:     job '%s' added to queue at slot %d\n",
-      J->Name,
-      sindex);
+  if (J->Cred.A != NULL)
+    J->Cred.A->MTime = MSched.Time;
 
-    DstQ[sindex++] = SrcQ[jindex];
-    }  /* END for (jindex) */
+  if (MPar[0].FSC.FSPolicy != fspNONE)
+    {
+    int OIndex;
 
-  /* terminate list */
+    if (MFSCheckCap(NULL,J,P,&OIndex) == FAILURE)
+      {
+      DBG(5,fSCHED) DPrint("INFO:     job '%s' exceeds %s FS cap\n",
+        J->Name,
+        (OIndex > 0) ? MXO[OIndex] : "NONE");
+ 
+      if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
+        {
+        J->SystemQueueTime = MSched.Time;
+        }
+ 
+      Reason[marFairShare]++;
 
-  DstQ[sindex] = -1;
+      return(FAILURE);
+      }
+    }    /* END if (FS[0].FSPolicy != fspNONE) */
 
-  DBG(1,fSCHED)
+  /* NOTE:  idle queue policies handled in MQueueSelectAllJobs() */
+
+  if (MLocalCheckFairnessPolicy(J,MSched.Time,NULL) == FAILURE)
     {
-    DBG(1,fSCHED) DPrint("INFO:     total jobs selected in partition %s: %d/%-d ",
-      MAList[ePartition][PIndex],
-      sindex,
-      jindex);
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (violates local fairness policy)\n",
+      J->Name,
+      P->Name);
 
-    for (index = 0;index < MAX_MREJREASON;index++)
+    if (GP->JobPrioAccrualPolicy == jpapFullPolicy) 
       {
-      if (Reason[index] != 0)
-        {
-        fprintf(mlog.logfp,"[%s: %d]",
-          MAllocRejType[index],
-          Reason[index]);
-        }
-      }    /* END for (index) */
+      J->SystemQueueTime = MSched.Time;
+      }
 
-    fprintf(mlog.logfp,"\n");
-    }
+    Reason[marPolicy]++;
 
-  if (sindex == 0)
     return(FAILURE);
+    }
 
   return(SUCCESS);
-  }  /* END MQueueSelectJobs() */
+  }  /* END MQueueCheckSingleJob() */
 
 
 
-- 
1.5.5.4

>From 1edc761361fce6466ecad0ddc9aa5d8a54689f37 Mon Sep 17 00:00:00 2001
From: Eygene Ryabinkin <[EMAIL PROTECTED]>
Date: Wed, 21 Mar 2007 10:57:59 +0300
Subject: [PATCH] Prepare the MSchedProcessJobs() for the two-pass scheduling.

Moved part of the original MJobGetPAL() function into the new
public function MJobFindDefPart(), which determines the default
partition for a job.

The MQueueSelectJobs() prototype was modified: the OnlyDefPart flag
was added. When it is set to TRUE, only jobs whose default partition
is the partition being processed are examined; all other jobs are
skipped in the selection process. When OnlyDefPart is set to FALSE
the original behaviour is retained: all jobs are examined.

The patch is a no-op from the functional point of view: the OnlyDefPart
argument to MQueueSelectJobs() is set to FALSE everywhere.

The patch was tested on the RRC-KI Grid cluster and has so far shown
no regressions in its daily operation.

Signed-off-by: Eygene Ryabinkin <[EMAIL PROTECTED]>
---
 include/moab-proto.h |    3 +-
 src/moab/MPar.c      |  107 ++++++++++++++++++++++++++++++--------------------
 src/moab/MPolicy.c   |   13 ++++++-
 src/moab/MQueue.c    |    2 +
 src/moab/MSched.c    |   16 +++++--
 src/server/UserI.c   |    1 +
 6 files changed, 92 insertions(+), 50 deletions(-)

diff --git a/include/moab-proto.h b/include/moab-proto.h
index 0ee4c81..65d0a29 100644
--- a/include/moab-proto.h
+++ b/include/moab-proto.h
@@ -399,6 +399,7 @@ int MJobSetState(mjob_t *,enum MJobStateEnum);
 int MJobPreempt(mjob_t *,mjob_t **,enum MPreemptPolicyEnum,char *,int *);
 int MJobResume(mjob_t *,char *,int *);
 int MJobGetPAL(mjob_t *,int *,int *,mpar_t **);
+mpar_t *MJobFindDefPart(mjob_t *, mclass_t *, int *);
 int MJobRemove(mjob_t *);
 int MJobGetAccount(mjob_t *,mgcred_t **);
 int MJobSetCreds(mjob_t *,char *,char *,char *);
@@ -494,7 +495,7 @@ int MQueueDiagnose(mjob_t **,int *,int,mpar_t *,char *,int);
 int MQueueCheckStatus(void);
 int MQueueGetRequeueValue(int *,long,long,double *);
 int MQueueSelectAllJobs(mjob_t **,int,mpar_t *,int *,int,int,int,char *);
-int MQueueSelectJobs(int *,int *,int,int,int,unsigned long,int,int *,mbool_t);
+int MQueueSelectJobs(int *,int *,int,int,int,unsigned long,int,int *,mbool_t,mbool_t);
 int MQueueAddAJob(mjob_t *);
 int MQueueRemoveAJob(mjob_t *,int);
 int MQueueBackFill(int *,int,mpar_t *);
diff --git a/src/moab/MPar.c b/src/moab/MPar.c
index 390fe4a..4017321 100644
--- a/src/moab/MPar.c
+++ b/src/moab/MPar.c
@@ -347,52 +347,11 @@ int MJobGetPAL(
   if (PAL != NULL)
     MUBMCopy(PAL,tmpPAL,MAX_MPAR);
  
-  /* determine allowed partition default (precedence: U,G,A,C,S,0) */
+  /* determine allowed partition default */
  
   if (PDef != NULL)
     {
-    if ((J->Cred.U->F.PDef != NULL) &&
-        (J->Cred.U->F.PDef != &MPar[0]) &&
-         MUBMCheck(((mpar_t *)J->Cred.U->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.U->F.PDef;
-      }
-    else if ((J->Cred.G->F.PDef != NULL) &&
-             (J->Cred.G->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)J->Cred.G->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.G->F.PDef;
-      }
-    else if ((J->Cred.A != NULL) &&
-             (J->Cred.A->F.PDef != NULL) &&
-             (J->Cred.A->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)J->Cred.A->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.A->F.PDef;
-      }
-    else if ((C != NULL) &&
-             (C->F.PDef != NULL) &&
-             (C->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)C->F.PDef)->Index,tmpPAL)) 
-      {
-      *PDef = (mpar_t  *)C->F.PDef;
-      }
-    else if ((J->Cred.Q != NULL) &&
-             (J->Cred.Q->F.PDef != NULL) &&
-             (J->Cred.Q->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)J->Cred.Q->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.Q->F.PDef;
-      }
-    else if ((MPar[0].F.PDef != NULL) &&
-             (MPar[0].F.PDef != &MPar[0]))
-      {
-      *PDef = (mpar_t  *)MPar[0].F.PDef;
-      }
-    else
-      {
-      *PDef = &MPar[MDEF_SYSPDEF];
-      }
+    *PDef = MJobFindDefPart(J, C, tmpPAL);
  
     /* verify access to default partition */
  
@@ -439,7 +398,69 @@ int MJobGetPAL(
   return(SUCCESS);
   }  /* END MJobGetPAL() */
 
+/*
+ * Determines default partition for a job (precedence: U,G,A,C,S,0)
+ * 'PAL' is consulted to determine partition access if it is not NULL.
+ * 'C' is consulted for the default partition if it is not NULL.
+ */
+mpar_t *MJobFindDefPart(
+  mjob_t   *J,     /* I:  job                                */
+  mclass_t *C,     /* I:  job class                          */
+  int      *PAL)   /* I:  partition access list              */
+
+  {
+  mpar_t   *PDef;
+
+  if ((J->Cred.U->F.PDef != NULL) &&
+      (J->Cred.U->F.PDef != &MPar[0]) &&
+      (PAL == NULL ||
+       MUBMCheck(((mpar_t *)J->Cred.U->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.U->F.PDef;
+    }
+  else if ((J->Cred.G->F.PDef != NULL) &&
+           (J->Cred.G->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+            MUBMCheck(((mpar_t *)J->Cred.G->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.G->F.PDef;
+    }
+  else if ((J->Cred.A != NULL) &&
+           (J->Cred.A->F.PDef != NULL) &&
+           (J->Cred.A->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+            MUBMCheck(((mpar_t *)J->Cred.A->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.A->F.PDef;
+    }
+  else if ((C != NULL) &&
+           (C->F.PDef != NULL) &&
+           (C->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+            MUBMCheck(((mpar_t *)C->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)C->F.PDef;
+    }
+  else if ((J->Cred.Q != NULL) &&
+           (J->Cred.Q->F.PDef != NULL) &&
+           (J->Cred.Q->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+	    MUBMCheck(((mpar_t *)J->Cred.Q->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.Q->F.PDef;
+    }
+  else if ((MPar[0].F.PDef != NULL) &&
+           (MPar[0].F.PDef != &MPar[0]))
+    {
+    PDef = (mpar_t  *)MPar[0].F.PDef;
+    }
+  else
+    {
+    PDef = &MPar[MDEF_SYSPDEF];
+    }
 
+  return PDef;
+  }  /* END MJobFindDefPart() */
 
 
 int MParFind(
diff --git a/src/moab/MPolicy.c b/src/moab/MPolicy.c
index bfca663..c60a435 100644
--- a/src/moab/MPolicy.c
+++ b/src/moab/MPolicy.c
@@ -171,7 +171,8 @@ int MQueueSelectJobs(
   unsigned long  MaxWCLimit,    /* I */
   int            OrigPIndex,    /* I */
   int           *FReason,       /* O */
-  mbool_t        UpdateStats)   /* I:  (boolean) */
+  mbool_t        UpdateStats,   /* I:  (boolean) */
+  mbool_t        OnlyDefPart)   /* I:  (boolean) */
 
   {
   int      index;
@@ -263,6 +264,16 @@ int MQueueSelectJobs(
       continue;
       }
 
+    if (OnlyDefPart == TRUE && MJobFindDefPart(J, NULL, NULL) != P)
+      {
+      DBG(7,fSCHED) DPrint("INFO:     skipping job[%d] '%s', only default partition check requested (and current partition is %s)\n",
+        jindex,
+        J->Name,
+	P->Name);
+
+      continue;
+      }
+
     if (MQueueCheckSingleJob(J, Reason, P, GP, PLevel,
 	MaxNC, MaxPC, MaxWCLimit, OrigPIndex, UpdateStats) == FAILURE)
       continue;
diff --git a/src/moab/MQueue.c b/src/moab/MQueue.c
index 106a012..aba2bbb 100644
--- a/src/moab/MQueue.c
+++ b/src/moab/MQueue.c
@@ -446,6 +446,7 @@ int MQueueBackFill(
           AdjBFTime,
           P->Index,
           NULL,
+          FALSE,
           FALSE) == FAILURE)
       {
       DBG(5,fSCHED) DPrint("INFO:     no jobs meet BF window criteria in partition %s\n",
@@ -1516,6 +1517,7 @@ int MQueueCheckStatus()
                 MAX_MTIME,
                 -1,
                 ReasonList,
+                FALSE,
                 FALSE) == FAILURE)
             {
             strcpy(DeferMessage,"SCHED_INFO:  job cannot run.  Reason: cannot select job\n");
diff --git a/src/moab/MSched.c b/src/moab/MSched.c
index 8434272..92fbae0 100644
--- a/src/moab/MSched.c
+++ b/src/moab/MSched.c
@@ -6949,6 +6949,7 @@ int MSchedProcessJobs(
             MAX_MTIME,
             -1,
             NULL,
+            FALSE,
             FALSE) == SUCCESS)
         {
         memcpy(MFQ,tmpQ,sizeof(MFQ));
@@ -6971,7 +6972,8 @@ int MSchedProcessJobs(
         MAX_MTIME,
         -1,
         NULL,
-        TRUE);
+        TRUE,
+        FALSE);
 
       /* schedule priority jobs */
 
@@ -6996,7 +6998,8 @@ int MSchedProcessJobs(
                 MAX_MTIME,
                 PIndex,
                 NULL,
-                TRUE) == SUCCESS)
+                TRUE,
+                FALSE) == SUCCESS)
             {
             MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);
 
@@ -7023,7 +7026,8 @@ int MSchedProcessJobs(
         MAX_MTIME,
         -1,
         NULL,
-        TRUE);
+        TRUE,
+        FALSE);
 
       if (CurrentQ[0] != -1)
         {
@@ -7055,7 +7059,8 @@ int MSchedProcessJobs(
                 MAX_MTIME,
                 PIndex,
                 NULL,
-                TRUE) == SUCCESS)
+                TRUE,
+                FALSE) == SUCCESS)
             {
             MQueueBackFill(tmpQ,ptHARD,&MPar[PIndex]);
             }
@@ -7097,7 +7102,8 @@ int MSchedProcessJobs(
     MAX_MTIME,
     -1,
     NULL,
-    TRUE);
+    TRUE,
+    FALSE);
 
   /* must sort/order MUIQ */
 
diff --git a/src/server/UserI.c b/src/server/UserI.c
index 9bcd8da..c409c28 100644
--- a/src/server/UserI.c
+++ b/src/server/UserI.c
@@ -1790,6 +1790,7 @@ int UIJobShow(
           MAX_MTIME,
           P->Index,
           Reason,
+          FALSE,
           FALSE) == FAILURE) || (DstQ[0] == -1))
       {
       for (index = 0;index < MAX_MREJREASON;index++)
-- 
1.5.5.4

>From 769793a5657a13e6fb60ec9aebda5d6b712ee8ab Mon Sep 17 00:00:00 2001
From: Eygene Ryabinkin <[EMAIL PROTECTED]>
Date: Wed, 21 Mar 2007 14:10:22 +0300
Subject: [PATCH] Fixed default partition handling by the two-pass scheduling.

MSchedProcessJobs() now uses two-pass scheduling: the first pass over
all partitions schedules jobs that can be placed on their default
partitions, and the second pass schedules the rest of the jobs.
Backfilling is disabled on the first pass: we should first load the
queue with the eligible jobs and only then do the backfilling.

The patch was tested on the RRC-KI Grid cluster and has so far shown
no regressions in its daily operation. The default partition ('PDEF')
statement is working as expected: jobs are first scheduled to the
default partition, and only once the default partition's nodes are
busy do they go to the rest of the partitions.

Signed-off-by: Eygene Ryabinkin <[EMAIL PROTECTED]>
---
 src/moab/MSched.c |   81 ++++++++++++++++++++++++++++++----------------------
 1 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/src/moab/MSched.c b/src/moab/MSched.c
index 92fbae0..9ef5338 100644
--- a/src/moab/MSched.c
+++ b/src/moab/MSched.c
@@ -6977,44 +6977,57 @@ int MSchedProcessJobs(
 
       /* schedule priority jobs */
 
+#ifdef M_SCHEDULE_ON_PARTITIONS
+#error Symbol M_SCHEDULE_ON_PARTITIONS is already defined. Fix me, please.
+#endif
+#define M_SCHEDULE_ON_PARTITONS(_OnlyDefPart, _DoBackfill) \
+	do {								\
+        for (PIndex = 0;PIndex < MAX_MPAR;PIndex++)			\
+          {								\
+          if (((PIndex == 0) && (MPar[2].ConfigNodes == 0)) ||		\
+              (MPar[PIndex].ConfigNodes == 0))				\
+            {								\
+            continue;							\
+            }								\
+									\
+          MOQueueInitialize(tmpQ);					\
+									\
+          if (MQueueSelectJobs(						\
+                CurrentQ,						\
+                tmpQ,							\
+                ptSOFT,							\
+                MAX_MNODE,						\
+                MAX_MTASK,						\
+                MAX_MTIME,						\
+                PIndex,							\
+                NULL,							\
+                TRUE,							\
+                _OnlyDefPart) == SUCCESS)				\
+            {								\
+            MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);			\
+									\
+            if (_DoBackfill == TRUE && MPar[PIndex].BFPolicy != ptOFF)	\
+              {								\
+              /* backfill jobs using 'soft' policy constraints */	\
+									\
+              MQueueBackFill(tmpQ,ptSOFT,&MPar[PIndex]);		\
+              }								\
+            }								\
+									\
+          MOQueueDestroy(tmpQ,FALSE);					\
+          }    /* END for (PIndex) */					\
+	  } while (0)
+
       if (CurrentQ[0] != -1)
         {
-        for (PIndex = 0;PIndex < MAX_MPAR;PIndex++)
-          {
-          if (((PIndex == 0) && (MPar[2].ConfigNodes == 0)) ||
-              (MPar[PIndex].ConfigNodes == 0))
-            {
-            continue;
-            }
-
-          MOQueueInitialize(tmpQ);
-
-          if (MQueueSelectJobs(
-                CurrentQ,
-                tmpQ,
-                ptSOFT,
-                MAX_MNODE,
-                MAX_MTASK,
-                MAX_MTIME,
-                PIndex,
-                NULL,
-                TRUE,
-                FALSE) == SUCCESS)
-            {
-            MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);
-
-            if (MPar[PIndex].BFPolicy != ptOFF)
-              {
-              /* backfill jobs using 'soft' policy constraints */
-
-              MQueueBackFill(tmpQ,ptSOFT,&MPar[PIndex]);
-              }
-            }
-
-          MOQueueDestroy(tmpQ,FALSE);
-          }    /* END for (PIndex) */
+	/* schedule jobs on their default partitions; skip backfilling  */
+	M_SCHEDULE_ON_PARTITONS(TRUE, FALSE);
+	/* schedule jobs on all partitions; do backfilling  */
+	M_SCHEDULE_ON_PARTITONS(FALSE, TRUE);
         }      /* END if (GlobalSQ[0] != -1) */
 
+#undef M_SCHEDULE_ON_PARTITONS
+
       MOQueueDestroy(CurrentQ,TRUE);
 
       MQueueSelectJobs(
-- 
1.5.5.4

_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers

Reply via email to