Skip to content

Commit 53f36a1

Browse files
Merge pull request #312 from ava-labs/fix-health-checks
fixed router health check
2 parents af9680a + f6793f4 commit 53f36a1

File tree

2 files changed

+26
-8
lines changed

2 files changed

+26
-8
lines changed

api/health/service.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@ import (
77
"net/http"
88
"time"
99

10+
stdjson "encoding/json"
11+
1012
"github.com/gorilla/rpc/v2"
1113

1214
"github.com/ava-labs/avalanchego/health"
13-
healthlib "github.com/ava-labs/avalanchego/health"
1415
"github.com/ava-labs/avalanchego/snow/engine/common"
1516
"github.com/ava-labs/avalanchego/utils/json"
1617
"github.com/ava-labs/avalanchego/utils/logging"
18+
19+
healthlib "github.com/ava-labs/avalanchego/health"
1720
)
1821

1922
// Service wraps a [healthlib.Service]. Handler() returns a handler
@@ -74,15 +77,25 @@ type APIHealthReply struct {
7477
func (as *apiServer) Health(_ *http.Request, _ *APIHealthArgs, reply *APIHealthReply) error {
7578
as.log.Info("Health.health called")
7679
reply.Checks, reply.Healthy = as.Results()
77-
return nil
80+
if reply.Healthy {
81+
return nil
82+
}
83+
replyStr, err := stdjson.Marshal(reply.Checks)
84+
as.log.Warn("Health.health is returning an error: %s", string(replyStr))
85+
return err
7886
}
7987

8088
// GetLiveness returns a summary of the health of the node
8189
// Deprecated: use Health instead.
8290
func (as *apiServer) GetLiveness(_ *http.Request, _ *APIHealthArgs, reply *APIHealthReply) error {
83-
as.log.Info("Health: GetLiveness called")
91+
as.log.Info("Health.getLiveness called")
8492
reply.Checks, reply.Healthy = as.Results()
85-
return nil
93+
if reply.Healthy {
94+
return nil
95+
}
96+
replyStr, err := stdjson.Marshal(reply.Checks)
97+
as.log.Warn("Health.getLiveness is returning an error: %s", string(replyStr))
98+
return err
8699
}
87100

88101
type noOp struct{}

snow/networking/router/chain_router.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,6 @@ func (cr *ChainRouter) Initialize(
120120
// Assumes [cr.lock] is held
121121
func (cr *ChainRouter) removeRequest(id ids.ID) {
122122
delete(cr.requests, id)
123-
if len(cr.requests) == 0 {
124-
cr.lastTimeNoOutstanding = cr.clock.Time()
125-
}
126123
cr.metrics.outstandingRequests.Set(float64(len(cr.requests)))
127124
}
128125

@@ -140,8 +137,12 @@ func (cr *ChainRouter) RegisterRequest(
140137
) {
141138
uniqueRequestID := createRequestID(validatorID, chainID, requestID)
142139
cr.lock.Lock()
140+
if len(cr.requests) == 0 {
141+
cr.lastTimeNoOutstanding = cr.clock.Time()
142+
}
143143
// Add to the set of unfulfilled requests
144144
cr.requests[uniqueRequestID] = request{Time: cr.clock.Time(), MsgType: msgType}
145+
cr.metrics.outstandingRequests.Set(float64(len(cr.requests)))
145146
cr.lock.Unlock()
146147
// Register a timeout to fire if we don't get a reply in time.
147148
var timeoutHandler func() // Called upon timeout
@@ -760,7 +761,11 @@ func (cr *ChainRouter) HealthCheck() (interface{}, error) {
760761
healthy = healthy && numOutstandingReqs <= cr.healthConfig.MaxOutstandingRequests
761762
details["outstandingRequests"] = numOutstandingReqs
762763

763-
timeSinceNoOutstandingRequests := cr.clock.Time().Sub(cr.lastTimeNoOutstanding)
764+
now := cr.clock.Time()
765+
if numOutstandingReqs == 0 {
766+
cr.lastTimeNoOutstanding = now
767+
}
768+
timeSinceNoOutstandingRequests := now.Sub(cr.lastTimeNoOutstanding)
764769
healthy = healthy && timeSinceNoOutstandingRequests <= cr.healthConfig.MaxTimeSinceNoOutstandingRequests
765770
details["timeSinceNoOutstandingRequests"] = timeSinceNoOutstandingRequests.String()
766771

0 commit comments

Comments
 (0)