Skip to content

Commit 4bf2f92

Browse files
fixed router health check
1 parent af9680a commit 4bf2f92

File tree

2 files changed

+23
-7
lines changed

2 files changed

+23
-7
lines changed

api/health/service.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package health
55

66
import (
7+
stdjson "encoding/json"
78
"net/http"
89
"time"
910

@@ -74,15 +75,25 @@ type APIHealthReply struct {
7475
func (as *apiServer) Health(_ *http.Request, _ *APIHealthArgs, reply *APIHealthReply) error {
7576
as.log.Info("Health.health called")
7677
reply.Checks, reply.Healthy = as.Results()
77-
return nil
78+
if reply.Healthy {
79+
return nil
80+
}
81+
replyStr, err := stdjson.Marshal(reply.Checks)
82+
as.log.Warn("Health.health is returning an error: %s", string(replyStr))
83+
return err
7884
}
7985

8086
// GetLiveness returns a summary of the health of the node
8187
// Deprecated: use Health instead.
8288
func (as *apiServer) GetLiveness(_ *http.Request, _ *APIHealthArgs, reply *APIHealthReply) error {
83-
as.log.Info("Health: GetLiveness called")
89+
as.log.Info("Health.getLiveness called")
8490
reply.Checks, reply.Healthy = as.Results()
85-
return nil
91+
if reply.Healthy {
92+
return nil
93+
}
94+
replyStr, err := stdjson.Marshal(reply.Checks)
95+
as.log.Warn("Health.getLiveness is returning an error: %s", string(replyStr))
96+
return err
8697
}
8798

8899
type noOp struct{}

snow/networking/router/chain_router.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,6 @@ func (cr *ChainRouter) Initialize(
120120
// Assumes [cr.lock] is held
121121
func (cr *ChainRouter) removeRequest(id ids.ID) {
122122
delete(cr.requests, id)
123-
if len(cr.requests) == 0 {
124-
cr.lastTimeNoOutstanding = cr.clock.Time()
125-
}
126123
cr.metrics.outstandingRequests.Set(float64(len(cr.requests)))
127124
}
128125

@@ -140,8 +137,12 @@ func (cr *ChainRouter) RegisterRequest(
140137
) {
141138
uniqueRequestID := createRequestID(validatorID, chainID, requestID)
142139
cr.lock.Lock()
140+
if len(cr.requests) == 0 {
141+
cr.lastTimeNoOutstanding = cr.clock.Time()
142+
}
143143
// Add to the set of unfulfilled requests
144144
cr.requests[uniqueRequestID] = request{Time: cr.clock.Time(), MsgType: msgType}
145+
cr.metrics.outstandingRequests.Set(float64(len(cr.requests)))
145146
cr.lock.Unlock()
146147
// Register a timeout to fire if we don't get a reply in time.
147148
var timeoutHandler func() // Called upon timeout
@@ -760,7 +761,11 @@ func (cr *ChainRouter) HealthCheck() (interface{}, error) {
760761
healthy = healthy && numOutstandingReqs <= cr.healthConfig.MaxOutstandingRequests
761762
details["outstandingRequests"] = numOutstandingReqs
762763

763-
timeSinceNoOutstandingRequests := cr.clock.Time().Sub(cr.lastTimeNoOutstanding)
764+
now := cr.clock.Time()
765+
if numOutstandingReqs == 0 {
766+
cr.lastTimeNoOutstanding = now
767+
}
768+
timeSinceNoOutstandingRequests := now.Sub(cr.lastTimeNoOutstanding)
764769
healthy = healthy && timeSinceNoOutstandingRequests <= cr.healthConfig.MaxTimeSinceNoOutstandingRequests
765770
details["timeSinceNoOutstandingRequests"] = timeSinceNoOutstandingRequests.String()
766771

0 commit comments

Comments
 (0)