Wietse Venema:
> Santiago Romero:
> > 
> > > Please wait for an updated patch, we believe we have identified the
> > > cause and reproduced the symptoms (in that order). I have a candidate
> > > patch, but I expect Wietse will send an updated more polished version
> > > in the not too distant future.
> > >   
> > 
> >  Ok, I'll wait for it. I'm going to roll back to "ubuntu packages" (I 
> > already applied the patch and was testing it).
> 
> It will be later today. I don't have much time so I want to have
> it really right the first time. Code that is right takes more work
> than code that works.

To apply this patch, cd into the Postfix-2.5.* top-level source
directory and execute:

$ patch < thismessage

We were able to reproduce the scheduler looping problem, and it
does not recur with the patched version.

        Wietse

diff -cr /var/tmp/postfix-2.5.6/src/oqmgr/qmgr_transport.c 
src/oqmgr/qmgr_transport.c
*** /var/tmp/postfix-2.5.6/src/oqmgr/qmgr_transport.c   Sun Dec  2 13:13:26 2007
--- src/oqmgr/qmgr_transport.c  Thu Mar  5 16:06:43 2009
***************
*** 286,291 ****
--- 286,293 ----
            continue;
        need = xport->pending + 1;
        for (queue = xport->queue_list.next; queue; queue = queue->peers.next) {
+           if (QMGR_QUEUE_READY(queue) == 0)
+               continue;
            if ((need -= MIN5af51743e4eef(queue->window - queue->busy_refcount,
                                          queue->todo_refcount)) <= 0) {
                QMGR_LIST_ROTATE(qmgr_transport_list, xport);
diff -cr /var/tmp/postfix-2.5.6/src/qmgr/qmgr.h src/qmgr/qmgr.h
*** /var/tmp/postfix-2.5.6/src/qmgr/qmgr.h      Sat Dec  8 11:01:59 2007
--- src/qmgr/qmgr.h     Thu Mar  5 16:36:32 2009
***************
*** 436,441 ****
--- 436,442 ----
  
  extern QMGR_ENTRY *qmgr_job_entry_select(QMGR_TRANSPORT *);
  extern QMGR_PEER *qmgr_peer_select(QMGR_JOB *);
+ extern void qmgr_job_blocker_update(QMGR_QUEUE *);
  
  extern QMGR_JOB *qmgr_job_obtain(QMGR_MESSAGE *, QMGR_TRANSPORT *);
  extern void qmgr_job_free(QMGR_JOB *);
diff -cr /var/tmp/postfix-2.5.6/src/qmgr/qmgr_entry.c src/qmgr/qmgr_entry.c
*** /var/tmp/postfix-2.5.6/src/qmgr/qmgr_entry.c        Fri Dec 14 17:47:21 2007
--- src/qmgr/qmgr_entry.c       Thu Mar  5 16:29:46 2009
***************
*** 299,327 ****
      }
  
      /*
!      * If the queue was blocking some of the jobs on the job list, check if
!      * the concurrency limit has lifted. If there are still some pending
!      * deliveries, give it a try and unmark all transport blockers at once.
!      * The qmgr_job_entry_select() will do the rest. In either case make sure
!      * the queue is not marked as a blocker anymore, with extra handling of
!      * queues which were declared dead.
       * 
!      * Note that changing the blocker status also affects the candidate cache.
!      * Most of the cases would be automatically recognized by the current job
!      * change, but we play safe and reset the cache explicitly below.
!      * 
!      * Keeping the transport blocker tag odd is an easy way to make sure the 
tag
!      * never matches jobs that are not explicitly marked as blockers.
       */
!     if (queue->blocker_tag == transport->blocker_tag) {
!       if (queue->window > queue->busy_refcount && queue->todo.next != 0) {
!           transport->blocker_tag += 2;
!           transport->job_current = transport->job_list.next;
!           transport->candidate_cache_current = 0;
!       }
!       if (queue->window > queue->busy_refcount || QMGR_QUEUE_THROTTLED(queue))
!           queue->blocker_tag = 0;
      }
  
      /*
       * When there are no more entries for this peer, discard the peer
--- 299,323 ----
      }
  
      /*
!      * We implement a rate-limited queue by emulating a slow delivery
!      * channel. We insert the artificial delays with qmgr_queue_suspend().
       * 
!      * When a queue is suspended, we must postpone any job scheduling 
decisions
!      * until the queue is resumed. Otherwise, we make those decisions now.
!      * The job scheduling decisions are made by qmgr_job_blocker_update().
       */
!     if (which == QMGR_QUEUE_BUSY && transport->rate_delay > 0) {
!       if (queue->window > 1)
!           msg_panic("%s: queue %s/%s: window %d > 1 on rate-limited service",
!                     myname, transport->name, queue->name, queue->window);
!       if (QMGR_QUEUE_THROTTLED(queue))        /* XXX */
!           qmgr_queue_unthrottle(queue);
!       if (QMGR_QUEUE_READY(queue))
!           qmgr_queue_suspend(queue, transport->rate_delay);
      }
+     if (!QMGR_QUEUE_SUSPENDED(queue)
+       && queue->blocker_tag == transport->blocker_tag)
+       qmgr_job_blocker_update(queue);
  
      /*
       * When there are no more entries for this peer, discard the peer
***************
*** 336,354 ****
       */
      if (which == QMGR_QUEUE_BUSY)
        queue->last_done = event_time();
- 
-     /*
-      * Suspend a rate-limited queue, so that mail trickles out.
-      */
-     if (which == QMGR_QUEUE_BUSY && transport->rate_delay > 0) {
-       if (queue->window > 1)
-           msg_panic("%s: queue %s/%s: window %d > 1 on rate-limited service",
-                     myname, transport->name, queue->name, queue->window);
-       if (QMGR_QUEUE_THROTTLED(queue))        /* XXX */
-           qmgr_queue_unthrottle(queue);
-       if (QMGR_QUEUE_READY(queue))
-           qmgr_queue_suspend(queue, transport->rate_delay);
-     }
  
      /*
       * When the in-core queue for this site is empty and when this site is
--- 332,337 ----
diff -cr /var/tmp/postfix-2.5.6/src/qmgr/qmgr_job.c src/qmgr/qmgr_job.c
*** /var/tmp/postfix-2.5.6/src/qmgr/qmgr_job.c  Tue Nov  7 11:34:07 2006
--- src/qmgr/qmgr_job.c Thu Mar  5 16:43:36 2009
***************
*** 18,23 ****
--- 18,26 ----
  /*
  /*    QMGR_ENTRY *qmgr_job_entry_select(transport)
  /*    QMGR_TRANSPORT *transport;
+ /*
+ /*    void    qmgr_job_blocker_update(queue)
+ /*    QMGR_QUEUE *queue;
  /* DESCRIPTION
  /*    These routines add/delete/manipulate per-transport jobs.
  /*    Each job corresponds to a specific transport and message.
***************
*** 38,43 ****
--- 41,51 ----
  /*    If necessary, an attempt to read more recipients into core is made.
  /*    This can result in creation of more job, queue and entry structures.
  /*
+ /*    qmgr_job_blocker_update() updates the status of blocked
+ /*    jobs after a decrease in the queue's concurrency level,
+ /*    after the queue is throttled, or after the queue is resumed
+ /*    from suspension.
+ /*
  /*    qmgr_job_move_limits() takes care of proper distribution of the
  /*    per-transport recipients limit among the per-transport jobs.
  /*    Should be called whenever a job's recipient slot becomes available.
***************
*** 937,939 ****
--- 945,980 ----
      transport->job_current = 0;
      return (0);
  }
+ 
+ /* qmgr_job_blocker_update - update "blocked job" status */
+ 
+ void     qmgr_job_blocker_update(QMGR_QUEUE *queue)
+ {
+     QMGR_TRANSPORT *transport = queue->transport;
+ 
+     /*
+      * If the queue was blocking some of the jobs on the job list, check if
+      * the concurrency limit has lifted. If there are still some pending
+      * deliveries, give it a try and unmark all transport blockers at once.
+      * The qmgr_job_entry_select() will do the rest. In either case make sure
+      * the queue is not marked as a blocker anymore, with extra handling of
+      * queues which were declared dead.
+      * 
+      * Note that changing the blocker status also affects the candidate cache.
+      * Most of the cases would be automatically recognized by the current job
+      * change, but we play safe and reset the cache explicitly below.
+      * 
+      * Keeping the transport blocker tag odd is an easy way to make sure the 
tag
+      * never matches jobs that are not explicitly marked as blockers.
+      */
+     if (queue->blocker_tag == transport->blocker_tag) {
+       if (queue->window > queue->busy_refcount && queue->todo.next != 0) {
+           transport->blocker_tag += 2;
+           transport->job_current = transport->job_list.next;
+           transport->candidate_cache_current = 0;
+       }
+       if (queue->window > queue->busy_refcount || QMGR_QUEUE_THROTTLED(queue))
+           queue->blocker_tag = 0;
+     }
+ }
+ 
diff -cr /var/tmp/postfix-2.5.6/src/qmgr/qmgr_queue.c src/qmgr/qmgr_queue.c
*** /var/tmp/postfix-2.5.6/src/qmgr/qmgr_queue.c        Sat Dec  8 09:59:34 2007
--- src/qmgr/qmgr_queue.c       Thu Mar  5 17:35:24 2009
***************
*** 66,72 ****
  /*    "slow open" mode, and eliminates the "thundering herd" problem.
  /*
  /*    qmgr_queue_suspend() suspends delivery for this destination
! /*    briefly.
  /* DIAGNOSTICS
  /*    Panic: consistency check failure.
  /* LICENSE
--- 66,76 ----
  /*    "slow open" mode, and eliminates the "thundering herd" problem.
  /*
  /*    qmgr_queue_suspend() suspends delivery for this destination
! /*    briefly. This function invalidates any scheduling decisions
! /*    that are based on the present queue's concurrency window.
! /*    To compensate for work skipped by qmgr_entry_done(), the
! /*    status of blocker jobs is re-evaluated after the queue is
! /*    resumed.
  /* DIAGNOSTICS
  /*    Panic: consistency check failure.
  /* LICENSE
***************
*** 152,160 ****
--- 156,175 ----
      /*
       * Every event handler that leaves a queue in the "ready" state should
       * remove the queue when it is empty.
+      * 
+      * XXX Do not omit the redundant test below. It is here to simplify code
+      * consistency checks. The check is trivially eliminated by the compiler
+      * optimizer. There is no need to sacrifice code clarity for the sake of
+      * performance.
+      * 
+      * XXX Do not expose the blocker job logic here. Rate-limited queues are 
not
+      * a performance-critical feature. Here, too, there is no need to 
sacrifice
+      * code clarity for the sake of performance.
       */
      if (QMGR_QUEUE_READY(queue) && queue->todo.next == 0 && queue->busy.next 
== 0)
        qmgr_queue_done(queue);
+     else
+       qmgr_job_blocker_update(queue);
  }
  
  /* qmgr_queue_suspend - briefly suspend a destination */
diff -cr /var/tmp/postfix-2.5.6/src/qmgr/qmgr_transport.c 
src/qmgr/qmgr_transport.c
*** /var/tmp/postfix-2.5.6/src/qmgr/qmgr_transport.c    Sun Dec  2 12:53:17 2007
--- src/qmgr/qmgr_transport.c   Thu Mar  5 15:08:44 2009
***************
*** 291,296 ****
--- 291,298 ----
            continue;
        need = xport->pending + 1;
        for (queue = xport->queue_list.next; queue; queue = queue->peers.next) {
+           if (QMGR_QUEUE_READY(queue) == 0)
+               continue;
            if ((need -= MIN5af51743e4eef(queue->window - queue->busy_refcount,
                                          queue->todo_refcount)) <= 0) {
                QMGR_LIST_ROTATE(qmgr_transport_list, xport, peers);

Reply via email to