
Commit 143b580

Rework ReplicationLagWarning alert
The previous version was creating alerts after a fresh install:
- the oplog would not be full in this case,
- with no operations, the computed lag might be wrongly high, or
- the platform is in a transient / startup phase.

We would create this alert even though the oplog was not full, because the oplog window (the head/tail delta) was too small. We avoid that here by ensuring the alert is only created if the oplog circular buffer is fully utilized. This makes more sense because, in the other cases, it means we still have margin.

Issue: ZENKO-5047
1 parent f64f6d0 commit 143b580
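
In short, the lag condition is now ANDed with an oplog-utilization gate. A condensed sketch of the new clause, taken from the full expressions in the diff below, with `<lag ratio expression>` standing in for the existing lag computation and the same ${namespace} / ${service} placeholders used by the rules file:

    (
      <lag ratio expression> * 100 > ${replicationLagWarningThreshold}
    )
    AND on(rs_nm)
    (
      sum by (rs_nm) (
        (
          mongodb_oplog_stats_storageStats_size{namespace="${namespace}", pod=~"${service}.*"}
          /
          mongodb_oplog_stats_storageStats_maxSize{namespace="${namespace}", pod=~"${service}.*"}
        )
        * on(pod) group_left()
        (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
      ) >= 0.95
    )

The second operand keeps only the primary's oplog utilization (size / maxSize, via the my_state == 1 filter) and requires it to be at least 95% before any lag alert can fire; the critical rule applies the same gate with ${replicationLagCriticalThreshold}.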

2 files changed: +130 −32 lines changed

monitoring/mongodb/alerts.test.yaml

Lines changed: 70 additions & 0 deletions
@@ -168,6 +168,11 @@ tests:
   - name: ReplicationLagWarning
     interval: 1m
     input_series:
+      - series: mongodb_oplog_stats_storageStats_maxSize{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data"}
+        values: 10000x15
+      - series: mongodb_oplog_stats_storageStats_size{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data"}
+        values: 96000x15
+
       - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"}
         values: 5 35000+1000x10 45000
       - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
@@ -249,6 +254,71 @@ tests:
         eval_time: 12m
         exp_alerts: []

+  - name: ReplicationLagWarning with oplog window not full
+    interval: 1m
+    input_series:
+      - series: mongodb_oplog_stats_storageStats_maxSize{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data"}
+        values: 100000x15
+      - series: mongodb_oplog_stats_storageStats_size{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data"}
+        values: 1000+1000x15
+
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"}
+        values: 5 35000+1000x10 45000
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 0 24000+1000x10 39000
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-2.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 2 2000+1000x7 7000+0x2 41000
+
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"}
+        values: 5 34000+1000x10 44000
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 0 26000+1000x10 40000
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-2.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 2 1000+1000x7 6000+0x2 40000
+
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-0.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="PRIMARY"}
+        values: 5 12000+1000x10 43000
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-1.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 0 9000+1000x10 38000
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2", rs_nm="shard0-data", member_idx="data-db-mongodb-sharded-shard0-data-2.data-db-mongodb-sharded-shardsrv-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 2 3000+1000x7 8000+0x2 42000
+
+      - series: mongodb_mongod_replset_my_state{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0"}
+        values: 1+0x12
+      - series: mongodb_mongod_replset_my_state{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-1"}
+        values: 2+0x12
+      - series: mongodb_mongod_replset_my_state{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-2"}
+        values: 2+0x12
+
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-configsvr-1", rs_nm="configsvr", member_idx="data-db-mongodb-sharded-configsvr-0.data-db-mongodb-sharded-configsvr-headless.svc.cluster.local", member_state="PRIMARY"}
+        values: 71 8300 9500
+      - series: mongodb_rs_members_optimeDate{namespace="zenko",pod="data-db-mongodb-sharded-configsvr-1", rs_nm="configsvr", member_idx="data-db-mongodb-sharded-configsvr-1.data-db-mongodb-sharded-configsvr-headless.svc.cluster.local", member_state="SECONDARY"}
+        values: 50 8200 9400
+
+      # We simulate a constant oplog window for this test. In real life, the oplog window can increase
+      # or decrease depending on the load.
+      - series: mongodb_mongod_replset_oplog_head_timestamp{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0"}
+        values: 38000+1000x12
+      - series: mongodb_mongod_replset_oplog_tail_timestamp{namespace="zenko",pod="data-db-mongodb-sharded-shard0-data-0"}
+        values: 1000+1000x12
+
+    alert_rule_test:
+      - alertname: ReplicationLagWarning
+        eval_time: 1m
+        exp_alerts: []
+      - alertname: ReplicationLagWarning
+        eval_time: 2m
+        exp_alerts: []
+      - alertname: ReplicationLagWarning
+        eval_time: 10m
+        exp_alerts: []
+      - alertname: ReplicationLagWarning
+        eval_time: 11m
+        exp_alerts: []
+      - alertname: ReplicationLagWarning
+        eval_time: 12m
+        exp_alerts: []
+
   - name: TooManyClientConnectionsWarning
     interval: 1m
     input_series:
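
These are standard promtool unit tests for the alert rules. Assuming the test file's rule_files entry points at the rendered alerts.yaml (the rules use ${...} placeholders that are substituted before testing), the suite can be run locally with something like:

    promtool test rules monitoring/mongodb/alerts.test.yaml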

monitoring/mongodb/alerts.yaml

Lines changed: 60 additions & 32 deletions
@@ -103,24 +103,38 @@ groups:
       - alert: ReplicationLagWarning
         expr: |
           label_replace(
-            max by(rs_nm, member_idx) (
-              (
-                max by(pod, rs_nm) (
-                  mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="PRIMARY"}
+            (
+              max by(rs_nm, member_idx) (
+                (
+                  max by(pod, rs_nm) (
+                    mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="PRIMARY"}
+                  )
+                  - ignoring(member_idx) group_right min by(pod, rs_nm, member_idx) (
+                    mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}
+                  )
+                ) / scalar(
+                  max by(rs_nm) (
+                    (mongodb_mongod_replset_oplog_head_timestamp{namespace="${namespace}", pod=~"${service}.*"}
+                    - on(pod)
+                    mongodb_mongod_replset_oplog_tail_timestamp{namespace="${namespace}", pod=~"${service}.*"})
+                    * on(pod) group_left()
+                    (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
+                  )
                 )
-                - ignoring(member_idx) group_right min by(pod, rs_nm, member_idx) (
-                  mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}
-                )
-              ) / scalar(
-                max by(rs_nm) (
-                  (mongodb_mongod_replset_oplog_head_timestamp{namespace="${namespace}", pod=~"${service}.*"}
-                  - on(pod)
-                  mongodb_mongod_replset_oplog_tail_timestamp{namespace="${namespace}", pod=~"${service}.*"})
-                  * on(pod) group_left()
-                  (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
+              ) * 100 > ${replicationLagWarningThreshold}
+            )
+            AND on(rs_nm)
+            (
+              sum by (rs_nm) (
+                (
+                  mongodb_oplog_stats_storageStats_size{namespace="${namespace}", pod=~"${service}.*"}
+                  /
+                  mongodb_oplog_stats_storageStats_maxSize{namespace="${namespace}", pod=~"${service}.*"}
                 )
-              )
-            ) * 100 > ${replicationLagWarningThreshold},
+                * on(pod) group_left()
+                (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
+              ) >= 0.95
+            ),
             "member_idx", "$1", "member_idx", "(${service}[^.]*)\\.${service}.*"
           )
         for: 8m
@@ -133,24 +147,38 @@ groups:
       - alert: ReplicationLagWarning
         expr: |
           label_replace(
-            max by(rs_nm, member_idx) (
-              (
-                max by(pod, rs_nm) (
-                  mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="PRIMARY"}
+            (
+              max by(rs_nm, member_idx) (
+                (
+                  max by(pod, rs_nm) (
+                    mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="PRIMARY"}
+                  )
+                  - ignoring(member_idx) group_right min by(pod, rs_nm, member_idx) (
+                    mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}
+                  )
+                ) / scalar(
+                  max by(rs_nm) (
+                    (mongodb_mongod_replset_oplog_head_timestamp{namespace="${namespace}", pod=~"${service}.*"}
+                    - on(pod)
+                    mongodb_mongod_replset_oplog_tail_timestamp{namespace="${namespace}", pod=~"${service}.*"})
+                    * on(pod) group_left()
+                    (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
+                  )
                 )
-                - ignoring(member_idx) group_right min by(pod, rs_nm, member_idx) (
-                  mongodb_rs_members_optimeDate{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}
-                )
-              ) / scalar(
-                max by(rs_nm) (
-                  (mongodb_mongod_replset_oplog_head_timestamp{namespace="${namespace}", pod=~"${service}.*"}
-                  - on(pod)
-                  mongodb_mongod_replset_oplog_tail_timestamp{namespace="${namespace}", pod=~"${service}.*"})
-                  * on(pod) group_left()
-                  (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
+              ) * 100 > ${replicationLagCriticalThreshold}
+            )
+            AND on(rs_nm)
+            (
+              sum by (rs_nm) (
+                (
+                  mongodb_oplog_stats_storageStats_size{namespace="${namespace}", pod=~"${service}.*"}
+                  /
+                  mongodb_oplog_stats_storageStats_maxSize{namespace="${namespace}", pod=~"${service}.*"}
                 )
-              )
-            ) * 100 > ${replicationLagCriticalThreshold},
+                * on(pod) group_left()
+                (mongodb_mongod_replset_my_state{namespace="${namespace}", pod=~"${service}.*"} == 1)
+              ) >= 0.95
+            ),
            "member_idx", "$1", "member_idx", "(${service}[^.]*)\\.${service}.*"
           )
         for: 2m
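
As a quick sanity check of the new gate against the unit tests above (values read from alerts.test.yaml; approximate, since the exact sample depends on the evaluation time):

    # original ReplicationLagWarning test:
    #   size / maxSize = 96000 / 10000  = 9.6  >= 0.95  -> gate passes, existing lag expectations unchanged
    # "oplog window not full" test, around eval_time 10m:
    #   size / maxSize = 11000 / 100000 = 0.11 <  0.95  -> gate fails, exp_alerts stays []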
