Your probes look very impatient to me; a 2-second timeout is aggressive. You can define a more patient view of your backends in a second director and, if a request fails, restart it and send it to those slower but more likely available backends. Use a 60-second timeout on the patient view.
Stefan Caunter Operations TorstarDigital 416.561.4871 On 2011-02-16, at 4:31 PM, "Frank Farmer" <[email protected]> wrote: > I'm having an issue where my backends fall behind for a little while > (maybe 30 seconds or so), and then varnish stops probing entirely for > minutes at a time, even though the host has long since recovered. I'm > near capacity, currently, so I can't afford to lose a backend for > minutes -- the extra traffic tends to back up my other backends, which > then also end up taken out of the pool for minutes, even though they > recover in seconds. > > Is there anything I can do to control this interval? I'd love to have > varnish never wait more than, say, 30 seconds between probes, even at > the worst of times. > > Varnish version: > > # varnishd -V > varnishd (varnish-2.1.3 SVN 5049:5055) > Copyright (c) 2006-2009 Linpro AS / Verdens Gang AS > > Excerpt of Apache access_log showing probes received by one of my backends: > > # ... NORMAL PROBE FREQUENCY > app005 - - - [16/Feb/2011:18:44:00 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:44:01 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:44:02 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:44:04 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > # PROBES STOP FOR 2 MINUTES > app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > 
app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:10 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:10 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:11 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:12 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:13 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:14 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:15 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:16 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:17 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:18 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:19 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:46:22 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > # PROBES STOP FOR 2 MINUTES > app005 - - - [16/Feb/2011:18:48:23 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:48:23 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > # ... 
SNIP SEVERAL MINUTES OF NORMAL PROBING > app005 - - - [16/Feb/2011:18:54:33 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:54:33 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:18:54:33 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > # PROBES STOP FOR 7 MINUTES > app005 - - - [16/Feb/2011:19:01:33 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:19:01:45 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > app005 - - - [16/Feb/2011:19:01:46 +0000] "GET /health.html HTTP/1.1" > 200 24 "-" "-" > # ...PROBES OPERATE NORMALLY FOR HOURS... > > > Backend config: > > > backend app001 { > .host = "app001-private"; > .port = "8880"; > .probe = { > .url = "/health.html"; > .timeout = 2s; > .interval = 1s; > .window = 10; > .threshold = 8; > } > .connect_timeout = 2s; > } > > backend app002 { > .host = "app002-private"; > .port = "8880"; > .probe = { > .url = "/health.html"; > .timeout = 2s; > .interval = 1s; > .window = 10; > .threshold = 8; > } > .connect_timeout = 2s; > } > > backend app003 { > .host = "app003-private"; > .port = "8880"; > .probe = { > .url = "/health.html"; > .timeout = 2s; > .interval = 1s; > .window = 10; > .threshold = 8; > } > .connect_timeout = 2s; > } > > backend app005 { > .host = "app005-private"; > .port = "8880"; > .probe = { > .url = "/health.html"; > .timeout = 2s; > .interval = 1s; > .window = 10; > .threshold = 5; > .initial = 10; > } > .connect_timeout = 2s; > } > > backend app006 { > .host = "app006-private"; > .port = "8880"; > .probe = { > .url = "/health.html"; > .timeout = 2s; > .interval = 1s; > .window = 10; > .threshold = 5; > .initial = 10; > } > .connect_timeout = 2s; > } > > director app_servers random { > { > .backend = app001; > .weight = 10; > } > { > .backend = app002; > .weight = 100; > } > { > .backend = app003; > .weight = 75; > } > { > .backend = app005; > .weight = 300; > } > { > .backend = app006; > .weight = 300; > 
} > } > > _______________________________________________ > varnish-misc mailing list > [email protected] > http://www.varnish-cache.org/lists/mailman/listinfo/varnish-misc _______________________________________________ varnish-misc mailing list [email protected] http://www.varnish-cache.org/lists/mailman/listinfo/varnish-misc
