changeset 4207:4fc91bae6f83

Better recheck of dead upstream servers. Previously, nginx marked a backend as live again as soon as fail_timeout (10s by default) had passed since the last failure. On the other hand, detecting a dead backend takes up to 60s (proxy_connect_timeout) in the typical situation where the backend is down and doesn't respond to any packets. This resulted in suboptimal behaviour in that situation (up to 23% of requests were directed to the dead backend with default settings). A more detailed description of the problem may be found here (in Russian): http://mailman.nginx.org/pipermail/nginx-ru/2011-August/042172.html The fix is to allow only one request after fail_timeout passes, and to mark the backend as "live" only if this request succeeds. Note that with the new code a backend will not be marked "live" unless the "check" request has completed, and this may take a while in some specific workloads (e.g. streaming). This is believed to be acceptable.
author Maxim Dounin <mdounin@mdounin.ru>
date Wed, 12 Oct 2011 14:22:48 +0000
parents 1a94a56a4e5d
children 470462cfa31b
files src/http/modules/ngx_http_upstream_ip_hash_module.c src/http/ngx_http_upstream_round_robin.c src/http/ngx_http_upstream_round_robin.h
diffstat 3 files changed, 18 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/src/http/modules/ngx_http_upstream_ip_hash_module.c
+++ b/src/http/modules/ngx_http_upstream_ip_hash_module.c
@@ -185,8 +185,8 @@ ngx_http_upstream_get_ip_hash_peer(ngx_p
                     break;
                 }
 
-                if (now - peer->accessed > peer->fail_timeout) {
-                    peer->fails = 0;
+                if (now - peer->checked > peer->fail_timeout) {
+                    peer->checked = now;
                     break;
                 }
             }
--- a/src/http/ngx_http_upstream_round_robin.c
+++ b/src/http/ngx_http_upstream_round_robin.c
@@ -443,8 +443,8 @@ ngx_http_upstream_get_round_robin_peer(n
                             break;
                         }
 
-                        if (now - peer->accessed > peer->fail_timeout) {
-                            peer->fails = 0;
+                        if (now - peer->checked > peer->fail_timeout) {
+                            peer->checked = now;
                             break;
                         }
 
@@ -491,8 +491,8 @@ ngx_http_upstream_get_round_robin_peer(n
                             break;
                         }
 
-                        if (now - peer->accessed > peer->fail_timeout) {
-                            peer->fails = 0;
+                        if (now - peer->checked > peer->fail_timeout) {
+                            peer->checked = now;
                             break;
                         }
 
@@ -663,15 +663,16 @@ ngx_http_upstream_free_round_robin_peer(
         return;
     }
 
+    peer = &rrp->peers->peer[rrp->current];
+
     if (state & NGX_PEER_FAILED) {
         now = ngx_time();
 
-        peer = &rrp->peers->peer[rrp->current];
-
         /* ngx_lock_mutex(rrp->peers->mutex); */
 
         peer->fails++;
         peer->accessed = now;
+        peer->checked = now;
 
         if (peer->max_fails) {
             peer->current_weight -= peer->weight / peer->max_fails;
@@ -686,6 +687,14 @@ ngx_http_upstream_free_round_robin_peer(
         }
 
         /* ngx_unlock_mutex(rrp->peers->mutex); */
+
+    } else {
+
+        /* mark peer live if check passed */
+
+        if (peer->accessed < peer->checked) {
+            peer->fails = 0;
+        }
     }
 
     rrp->current++;
--- a/src/http/ngx_http_upstream_round_robin.h
+++ b/src/http/ngx_http_upstream_round_robin.h
@@ -23,6 +23,7 @@ typedef struct {
 
     ngx_uint_t                      fails;
     time_t                          accessed;
+    time_t                          checked;
 
     ngx_uint_t                      max_fails;
     time_t                          fail_timeout;