
Commit c3adc9e

Merge pull request #1676 from qJkee/OCPEDGE-902
OCPEDGE-902: add SNO control plane high cpu usage alert
2 parents 58d79ab + 5b7bf4c commit c3adc9e

9 files changed, +837 -3 lines changed

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cpu-utilization
  namespace: openshift-kube-apiserver
spec:
  groups:
    - name: control-plane-cpu-utilization
      rules:
        - alert: HighOverallControlPlaneCPU
          annotations:
            summary: >-
              CPU utilization across control plane pods is more than 60% of total CPU. High CPU usage usually means that something is going wrong.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
            description: >-
              This level of CPU utilization of a control plane is probably not a problem under most circumstances, but high levels of utilization may indicate
              problems with the cluster or control plane pods. To manage this alert or modify its threshold in case of false positives, see the following link:
              https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / ${CPU-COUNT} * 100 > 60
          for: 10m
          labels:
            namespace: openshift-kube-apiserver
            severity: warning
        - alert: ExtremelyHighIndividualControlPlaneCPU
          annotations:
            summary: >-
              CPU utilization across control plane pods is more than 90% of total CPU. High CPU usage usually means that something is going wrong.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
            description: >-
              This level of CPU utilization of a control plane is probably not a problem under most circumstances, but high levels of utilization may indicate
              problems with the cluster or control plane pods. When workload partitioning is enabled,
              extreme CPU pressure can cause slow serialization and poor performance from the kube-apiserver and etcd.
              When this happens, there is a risk of clients seeing non-responsive API requests which are issued again,
              causing even more CPU pressure.
              It can also cause failing liveness probes due to slow etcd responsiveness on the backend.
              If one kube-apiserver fails under this condition, chances are you will experience a cascade as the remaining
              kube-apiservers are also under-provisioned.
              To fix this, increase the CPU and memory on your control plane nodes.
              To manage this alert or modify its threshold in case of false positives, see the following link:
              https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / ${CPU-COUNT} * 100 > 90
          for: 1h
          labels:
            namespace: openshift-kube-apiserver
            severity: critical
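
Note that the ${CPU-COUNT} token in both expr fields is not valid PromQL on its own; the controller added in this commit substitutes the number of reserved control plane cores before the rule is applied. A minimal standalone sketch of that substitution, assuming the default of 8 cores (the expr line below is copied from the rule above; everything else is illustrative, not part of the commit):

	package main

	import (
		"bytes"
		"fmt"
		"strconv"
	)

	func main() {
		// illustrative stand-in for the templated expr line of the rule above
		expr := []byte(`sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / ${CPU-COUNT} * 100 > 60`)

		// defaultCoresNum used by the controller when no PerformanceProfile is found
		cores := 8
		rendered := bytes.ReplaceAll(expr, []byte("${CPU-COUNT}"), []byte(strconv.Itoa(cores)))

		// the divisor becomes 8, so the query reads as a percentage of the reserved CPU
		fmt.Println(string(rendered))
	}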
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
package highcpuusagealertcontroller

import (
	"bytes"
	"context"
	"strconv"
	"time"

	configv1 "github.com/openshift/api/config/v1"
	configv1informers "github.com/openshift/client-go/config/informers/externalversions/config/v1"
	configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
	"github.com/openshift/cluster-kube-apiserver-operator/bindata"
	"github.com/openshift/library-go/pkg/controller/factory"
	"github.com/openshift/library-go/pkg/operator/events"
	"github.com/openshift/library-go/pkg/operator/resource/resourceapply"
	"github.com/openshift/library-go/pkg/operator/resource/resourceread"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/dynamic/dynamicinformer"
	"k8s.io/utils/cpuset"
)

// default number of control plane cores, taken from the docs
const defaultCoresNum = 8

var performanceGroup = schema.GroupVersionResource{Group: "performance.openshift.io", Version: "v2", Resource: "performanceprofiles"}

type highCPUUsageAlertController struct {
	client               dynamic.Interface
	infraLister          configlistersv1.InfrastructureLister
	clusterVersionLister configlistersv1.ClusterVersionLister
}

func NewHighCPUUsageAlertController(
	configInformer configv1informers.Interface,
	dynamicInformersForTargetNamespace dynamicinformer.DynamicSharedInformerFactory,
	client dynamic.Interface,
	recorder events.Recorder,
) factory.Controller {
	c := &highCPUUsageAlertController{
		client:               client,
		infraLister:          configInformer.Infrastructures().Lister(),
		clusterVersionLister: configInformer.ClusterVersions().Lister(),
	}

	prometheusAlertInformerForTargetNamespace := dynamicInformersForTargetNamespace.ForResource(schema.GroupVersionResource{
		Group:    "monitoring.coreos.com",
		Version:  "v1",
		Resource: "prometheusrules",
	})

	return factory.New().
		WithInformers(configInformer.Infrastructures().Informer(), configInformer.ClusterVersions().Informer(), prometheusAlertInformerForTargetNamespace.Informer()).
		WithSync(c.sync).ResyncEvery(10*time.Minute).
		ToController("highCPUUsageAlertController", recorder.WithComponentSuffix("high-cpu-usage-alert-controller"))
}

func (c *highCPUUsageAlertController) sync(ctx context.Context, syncCtx factory.SyncContext) error {
	infra, err := c.infraLister.Get("cluster")
	if err != nil {
		return err
	}

	var alertRaw []byte

	if infra.Status.InfrastructureTopology != configv1.SingleReplicaTopologyMode {
		// we moved creation of the alert here because the static resource controller was constantly
		// deleting the alert and fighting with this controller
		alertRaw, err = bindata.Asset("assets/alerts/cpu-utilization.yaml")
		if err != nil {
			return err
		}
	} else {
		clusterVersion, err := c.clusterVersionLister.Get("version")
		if err != nil {
			return err
		}

		alertRaw, err = snoAlert(ctx, c.client, clusterVersion.Status.Capabilities.EnabledCapabilities, infra.Status.CPUPartitioning)
		if err != nil {
			return err
		}
	}

	alertObj, err := resourceread.ReadGenericWithUnstructured(alertRaw)
	if err != nil {
		return err
	}

	_, _, err = resourceapply.ApplyPrometheusRule(ctx, c.client, syncCtx.Recorder(), alertObj.(*unstructured.Unstructured))
	return err
}

func snoAlert(ctx context.Context, client dynamic.Interface, enabledCapabilities []configv1.ClusterVersionCapability, cpuMode configv1.CPUPartitioningMode) ([]byte, error) {
	cores := defaultCoresNum

	// if the NodeTuning capability is disabled, there is no PerformanceProfile, so we proceed
	// with the default value.
	if sets.New(enabledCapabilities...).Has(configv1.ClusterVersionCapabilityNodeTuning) && cpuMode == configv1.CPUPartitioningAllNodes {
		foundCores, found, err := performanceProfileControlPlaneCores(ctx, client)
		if err != nil {
			return nil, err
		}
		// take the core count from the PerformanceProfile if one was found;
		// if not, proceed with the default value
		if found {
			cores = foundCores
		}
	}

	fileData, err := bindata.Asset("assets/alerts/cpu-utilization-sno.yaml")
	if err != nil {
		return nil, err
	}
	fileData = bytes.ReplaceAll(fileData, []byte(`${CPU-COUNT}`), []byte(strconv.Itoa(cores)))

	return fileData, nil
}

// performanceProfileControlPlaneCores returns the cores allocated for control plane pods via
// the PerformanceProfile object. The bool value indicates whether a PerformanceProfile was found for the master nodes.
func performanceProfileControlPlaneCores(ctx context.Context, client dynamic.Interface) (int, bool, error) {
	// fetch the resource directly instead of using an informer because
	// the NodeTuning capability can be disabled at start and enabled later
	obj, err := client.Resource(performanceGroup).List(ctx, metav1.ListOptions{})
	if err != nil {
		return 0, false, err
	}

	for _, pf := range obj.Items {
		nodeSelector, found, err := unstructured.NestedStringMap(pf.Object, "spec", "nodeSelector")
		if err != nil {
			return 0, false, err
		}
		if !found {
			continue
		}
		if _, ok := nodeSelector["node-role.kubernetes.io/master"]; !ok {
			continue
		}

		reservedCPU, found, err := unstructured.NestedString(pf.Object, "spec", "cpu", "reserved")
		if err != nil {
			return 0, false, err
		}
		if !found {
			continue
		}

		cores, err := coresInCPUSet(reservedCPU)
		if err != nil {
			return 0, false, err
		}
		return cores, true, nil
	}

	return 0, false, nil
}

func coresInCPUSet(set string) (int, error) {
	cpuMap, err := cpuset.Parse(set)
	return cpuMap.Size(), err
}
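
For reference, coresInCPUSet relies on k8s.io/utils/cpuset to turn a reserved-CPU string (the spec.cpu.reserved field of a PerformanceProfile) into a core count. A small standalone sketch, using a hypothetical reserved set rather than a real cluster object:

	package main

	import (
		"fmt"

		"k8s.io/utils/cpuset"
	)

	func main() {
		// hypothetical spec.cpu.reserved value from a PerformanceProfile
		reserved := "0-3,52-55"

		set, err := cpuset.Parse(reserved)
		if err != nil {
			panic(err)
		}

		// prints 8: the value the controller would substitute for ${CPU-COUNT} in the SNO alert rule
		fmt.Println(set.Size())
	}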

0 commit comments
