@@ -36,12 +36,12 @@ type Distributor struct {
3636
3737 alertmanagerRing ring.ReadRing
3838 alertmanagerClientsPool ClientsPool
39-
40- logger log.Logger
39+ ringConfig RingConfig
40+ logger log.Logger
4141}
4242
4343// NewDistributor constructs a new Distributor
44- func NewDistributor (cfg ClientConfig , maxRecvMsgSize int64 , alertmanagersRing * ring.Ring , alertmanagerClientsPool ClientsPool , logger log.Logger , reg prometheus.Registerer ) (d * Distributor , err error ) {
44+ func NewDistributor (cfg ClientConfig , maxRecvMsgSize int64 , alertmanagersRing * ring.Ring , alertmanagerClientsPool ClientsPool , ringConfig RingConfig , logger log.Logger , reg prometheus.Registerer ) (d * Distributor , err error ) {
4545 if alertmanagerClientsPool == nil {
4646 alertmanagerClientsPool = newAlertmanagerClientsPool (client .NewRingServiceDiscovery (alertmanagersRing ), cfg , logger , reg )
4747 }
@@ -52,6 +52,7 @@ func NewDistributor(cfg ClientConfig, maxRecvMsgSize int64, alertmanagersRing *r
5252 maxRecvMsgSize : maxRecvMsgSize ,
5353 alertmanagerRing : alertmanagersRing ,
5454 alertmanagerClientsPool : alertmanagerClientsPool ,
55+ ringConfig : ringConfig ,
5556 }
5657
5758 d .Service = services .NewBasicService (nil , d .running , nil )
@@ -89,6 +90,9 @@ func (d *Distributor) isQuorumReadPath(p string) (bool, merger.Merger) {
8990 if strings .HasSuffix (path .Dir (p ), "/v2/silence" ) {
9091 return true , merger.V2SilenceID {}
9192 }
93+ if strings .HasSuffix (p , "/v2/receivers" ) {
94+ return true , merger.V2Receivers {}
95+ }
9296 return false , nil
9397}
9498
@@ -160,7 +164,7 @@ func (d *Distributor) doQuorum(userID string, w http.ResponseWriter, r *http.Req
160164 var responses []* httpgrpc.HTTPResponse
161165 var responsesMtx sync.Mutex
162166 grpcHeaders := httpToHttpgrpcHeaders (r .Header )
163- err = ring .DoBatch (r .Context (), RingOp , d .alertmanagerRing , nil , []uint32 {users .ShardByUser (userID )}, func (am ring.InstanceDesc , _ []int ) error {
167+ err = ring .DoBatch (r .Context (), GetRingOp ( d . ringConfig . DisableReplicaSetExtension ) , d .alertmanagerRing , nil , []uint32 {users .ShardByUser (userID )}, func (am ring.InstanceDesc , _ []int ) error {
164168 // Use a background context to make sure all alertmanagers get the request even if we return early.
165169 localCtx := opentracing .ContextWithSpan (user .InjectOrgID (context .Background (), userID ), opentracing .SpanFromContext (r .Context ()))
166170 sp , localCtx := opentracing .StartSpanFromContext (localCtx , "Distributor.doQuorum" )
@@ -207,7 +211,7 @@ func (d *Distributor) doQuorum(userID string, w http.ResponseWriter, r *http.Req
207211
208212func (d * Distributor ) doUnary (userID string , w http.ResponseWriter , r * http.Request , logger log.Logger ) {
209213 key := users .ShardByUser (userID )
210- replicationSet , err := d .alertmanagerRing .Get (key , RingOp , nil , nil , nil )
214+ replicationSet , err := d .alertmanagerRing .Get (key , GetRingOp ( d . ringConfig . DisableReplicaSetExtension ) , nil , nil , nil )
211215 if err != nil {
212216 level .Error (logger ).Log ("msg" , "failed to get replication set from the ring" , "err" , err )
213217 w .WriteHeader (http .StatusInternalServerError )
@@ -244,16 +248,30 @@ func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Requ
244248 instances [i ], instances [j ] = instances [j ], instances [i ]
245249 })
246250 } else {
247- //Picking 1 instance at Random for Non-Get and Non-Delete Unary Read requests, as shuffling through large number of instances might increase complexity
248- randN := rand .Intn (len (replicationSet .Instances ))
249- instances = replicationSet .Instances [randN : randN + 1 ]
251+ // For POST requests, add retry logic to PutSilence
252+ if d .isUnaryWritePath (r .URL .Path ) {
253+ instances = replicationSet .Instances
254+ rand .Shuffle (len (instances ), func (i , j int ) {
255+ instances [i ], instances [j ] = instances [j ], instances [i ]
256+ })
257+ } else {
258+ // Other POST requests pick 1 instance at Random for Non-Get and Non-Delete Unary Read requests, as shuffling through large number of instances might increase complexity
259+ randN := rand .Intn (len (replicationSet .Instances ))
260+ instances = replicationSet .Instances [randN : randN + 1 ]
261+ }
250262 }
251263
252264 var lastErr error
253265 for _ , instance := range instances {
254266 resp , err := d .doRequest (ctx , instance , req )
255- // storing the last error message
256267 if err != nil {
268+ // For PutSilence with non-retryable errors, fail immediately
269+ if d .isUnaryWritePath (r .URL .Path ) && ! d .isRetryableError (err ) {
270+ level .Error (logger ).Log ("msg" , "non-retryable error from alertmanager" , "instance" , instance .Addr , "err" , err )
271+ respondFromError (err , w , logger )
272+ return
273+ }
274+ // storing the last error message
257275 lastErr = err
258276 continue
259277 }
@@ -267,6 +285,49 @@ func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Requ
267285 }
268286}
269287
288+ // isRetryableError determines if an error is retryable (network/availability issues)
289+ // vs non-retryable (bad request, validation errors)
290+ func (d * Distributor ) isRetryableError (err error ) bool {
291+ if err == nil {
292+ return false
293+ }
294+
295+ // Check if it's an HTTP error with a status code
296+ httpResp , ok := httpgrpc .HTTPResponseFromError (errors .Cause (err ))
297+ if ok {
298+ statusCode := int (httpResp .Code )
299+
300+ if statusCode == http .StatusRequestTimeout || statusCode == http .StatusTooManyRequests || statusCode >= 500 {
301+ return true
302+ }
303+
304+ if statusCode >= 400 && statusCode < 500 {
305+ return false
306+ }
307+ }
308+
309+ // Network errors, context errors, etc. are retryable
310+ errorStr := err .Error ()
311+ retryablePatterns := []string {
312+ "connection refused" ,
313+ "connection reset" ,
314+ "timeout" ,
315+ "context deadline exceeded" ,
316+ "no such host" ,
317+ "network is unreachable" ,
318+ "broken pipe" ,
319+ }
320+
321+ for _ , pattern := range retryablePatterns {
322+ if strings .Contains (strings .ToLower (errorStr ), pattern ) {
323+ return true
324+ }
325+ }
326+
327+ // Default to retryable for unknown errors to maximize availability
328+ return true
329+ }
330+
270331func respondFromError (err error , w http.ResponseWriter , logger log.Logger ) {
271332 httpResp , ok := httpgrpc .HTTPResponseFromError (errors .Cause (err ))
272333 if ! ok {
0 commit comments