comparison src/http/ngx_http_upstream_round_robin.c @ 4207:4fc91bae6f83

Better recheck of dead upstream servers. Previously nginx marked a backend as live again as soon as fail_timeout (10s by default) had passed since the last failure. On the other hand, detecting a dead backend takes up to 60s (proxy_connect_timeout) in the typical situation where the backend is down and doesn't respond to any packets. This resulted in suboptimal behaviour in that situation (up to 23% of requests were directed to the dead backend with default settings). A more detailed description of the problem may be found here (in Russian): http://mailman.nginx.org/pipermail/nginx-ru/2011-August/042172.html The fix is to allow only one request after fail_timeout passes, and to mark the backend as "live" only if this request succeeds. Note that with the new code a backend will not be marked "live" until the "check" request has completed, and this may take a while in some specific workloads (e.g. streaming). This is believed to be acceptable.
author Maxim Dounin <mdounin@mdounin.ru>
date Wed, 12 Oct 2011 14:22:48 +0000
parents 9d4cbb09ae8b
children d620f497c50f
comparison
equal deleted inserted replaced
4206:1a94a56a4e5d 4207:4fc91bae6f83
441 || peer->fails < peer->max_fails) 441 || peer->fails < peer->max_fails)
442 { 442 {
443 break; 443 break;
444 } 444 }
445 445
446 if (now - peer->accessed > peer->fail_timeout) { 446 if (now - peer->checked > peer->fail_timeout) {
447 peer->fails = 0; 447 peer->checked = now;
448 break; 448 break;
449 } 449 }
450 450
451 peer->current_weight = 0; 451 peer->current_weight = 0;
452 452
489 || peer->fails < peer->max_fails) 489 || peer->fails < peer->max_fails)
490 { 490 {
491 break; 491 break;
492 } 492 }
493 493
494 if (now - peer->accessed > peer->fail_timeout) { 494 if (now - peer->checked > peer->fail_timeout) {
495 peer->fails = 0; 495 peer->checked = now;
496 break; 496 break;
497 } 497 }
498 498
499 peer->current_weight = 0; 499 peer->current_weight = 0;
500 500
661 if (rrp->peers->single) { 661 if (rrp->peers->single) {
662 pc->tries = 0; 662 pc->tries = 0;
663 return; 663 return;
664 } 664 }
665 665
666 peer = &rrp->peers->peer[rrp->current];
667
666 if (state & NGX_PEER_FAILED) { 668 if (state & NGX_PEER_FAILED) {
667 now = ngx_time(); 669 now = ngx_time();
668 670
669 peer = &rrp->peers->peer[rrp->current];
670
671 /* ngx_lock_mutex(rrp->peers->mutex); */ 671 /* ngx_lock_mutex(rrp->peers->mutex); */
672 672
673 peer->fails++; 673 peer->fails++;
674 peer->accessed = now; 674 peer->accessed = now;
675 peer->checked = now;
675 676
676 if (peer->max_fails) { 677 if (peer->max_fails) {
677 peer->current_weight -= peer->weight / peer->max_fails; 678 peer->current_weight -= peer->weight / peer->max_fails;
678 } 679 }
679 680
684 if (peer->current_weight < 0) { 685 if (peer->current_weight < 0) {
685 peer->current_weight = 0; 686 peer->current_weight = 0;
686 } 687 }
687 688
688 /* ngx_unlock_mutex(rrp->peers->mutex); */ 689 /* ngx_unlock_mutex(rrp->peers->mutex); */
690
691 } else {
692
693 /* mark peer live if check passed */
694
695 if (peer->accessed < peer->checked) {
696 peer->fails = 0;
697 }
689 } 698 }
690 699
691 rrp->current++; 700 rrp->current++;
692 701
693 if (rrp->current >= rrp->peers->number) { 702 if (rrp->current >= rrp->peers->number) {