Skip to content

Commit bc2841f

Browse files
Merge pull request #330 from ava-labs/update-health-checks
updated health check limits
2 parents 93d00d3 + 1b247b3 commit bc2841f

File tree

2 files changed

+7
-11
lines changed

2 files changed

+7
-11
lines changed

main/params.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -180,9 +180,9 @@ func avalancheFlagSet() *flag.FlagSet {
180180
fs.Duration(networkHealthMaxTimeSinceMsgReceivedKey, time.Minute, "Network layer returns unhealthy if haven't received a message for at least this much time")
181181
fs.Float64(networkHealthMaxPortionSendQueueFillKey, 0.9, "Network layer returns unhealthy if more than this portion of the pending send queue is full")
182182
fs.Uint(networkHealthMinPeersKey, 1, "Network layer returns unhealthy if connected to less than this many peers")
183-
fs.Float64(networkHealthMaxSendFailRateKey, .25, "Network layer reports unhealthy if more than this portion of attempted message sends fail")
183+
fs.Float64(networkHealthMaxSendFailRateKey, .9, "Network layer reports unhealthy if more than this portion of attempted message sends fail")
184184
// Router Health
185-
fs.Float64(routerHealthMaxDropRateKey, 0.25, "Node reports unhealthy if the router drops more than this portion of messages.")
185+
fs.Float64(routerHealthMaxDropRateKey, 1, "Node reports unhealthy if the router drops more than this portion of messages.")
186186
fs.Uint(routerHealthMaxOutstandingRequestsKey, 1024, "Node reports unhealthy if there are more than this many outstanding consensus requests (Get, PullQuery, etc.) over all chains")
187187
fs.Duration(networkHealthMaxTimeSinceNoReqsKey, 5*time.Minute, "Node reports unhealthy if there is at least 1 outstanding request continuously for this duration")
188188

@@ -226,7 +226,7 @@ func avalancheFlagSet() *flag.FlagSet {
226226
fs.Int(snowConcurrentRepollsKey, 4, "Minimum number of concurrent polls for finalizing consensus")
227227
fs.Int(snowOptimalProcessingKey, 50, "Optimal number of processing vertices in consensus")
228228
fs.Int(snowMaxProcessingKey, 1024, "Maximum number of processing items to be considered healthy")
229-
fs.Duration(snowMaxTimeProcessingKey, 10*time.Second, "Maximum amount of time an item should be processing and still be healthy")
229+
fs.Duration(snowMaxTimeProcessingKey, 2*time.Minute, "Maximum amount of time an item should be processing and still be healthy")
230230
fs.Int64(snowEpochFirstTransition, 1607626800, "Unix timestamp of the first epoch transaction, in seconds. Defaults to 12/10/2020 @ 7:00pm (UTC)")
231231
fs.Duration(snowEpochDuration, 6*time.Hour, "Duration of each epoch")
232232

network/network.go

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1471,27 +1471,23 @@ func (n *network) restartOnDisconnect() {
14711471
// 2) An error if the health check reports unhealthy
14721472
// Assumes [n.stateLock] is not held
14731473
func (n *network) HealthCheck() (interface{}, error) {
1474-
details := map[string]interface{}{}
1475-
14761474
// Get some data with the state lock held
14771475
connectedTo := 0
14781476
n.stateLock.RLock()
14791477
for _, peer := range n.peers {
14801478
if peer != nil && peer.connected.GetValue() {
14811479
connectedTo++
1482-
if connectedTo > int(n.healthConfig.MinConnectedPeers) {
1483-
break
1484-
}
14851480
}
14861481
}
14871482
pendingSendBytes := n.pendingBytes
14881483
sendFailRate := n.sendFailRateCalculator.Read()
14891484
n.stateLock.RUnlock()
14901485

14911486
// Make sure we're connected to at least the minimum number of peers
1492-
isSufficientlyConnected := connectedTo >= int(n.healthConfig.MinConnectedPeers)
1493-
healthy := isSufficientlyConnected
1494-
details["connectedToMinPeers"] = isSufficientlyConnected
1487+
healthy := connectedTo >= int(n.healthConfig.MinConnectedPeers)
1488+
details := map[string]interface{}{
1489+
"connectedPeers": connectedTo,
1490+
}
14951491

14961492
// Make sure we've received an incoming message within the threshold
14971493
now := n.clock.Time()

0 commit comments

Comments
 (0)