+package highcpuusagealertcontroller
+
+import (
+	"bytes"
+	"context"
+	"strconv"
+	"time"
+
+	configv1 "github.com/openshift/api/config/v1"
+	configv1informers "github.com/openshift/client-go/config/informers/externalversions/config/v1"
+	configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
+	"github.com/openshift/cluster-kube-apiserver-operator/bindata"
+	"github.com/openshift/library-go/pkg/controller/factory"
+	"github.com/openshift/library-go/pkg/operator/events"
+	"github.com/openshift/library-go/pkg/operator/resource/resourceapply"
+	"github.com/openshift/library-go/pkg/operator/resource/resourceread"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/client-go/dynamic"
+	"k8s.io/client-go/dynamic/dynamicinformer"
+)
+
+// defaultCoresNum is the default number of cores, taken from the docs
+const defaultCoresNum = 8
+
+var performanceGroup = schema.GroupVersionResource{Group: "performance.openshift.io", Version: "v2", Resource: "performanceprofiles"}
+
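+// highCPUUsageAlertController manages the PrometheusRule that alerts on high
+// control plane CPU usage, picking the rule variant based on the cluster topology.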
+type highCPUUsageAlertController struct {
+	client               dynamic.Interface
+	infraLister          configlistersv1.InfrastructureLister
+	clusterVersionLister configlistersv1.ClusterVersionLister
+}
+
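+// NewHighCPUUsageAlertController returns a controller that keeps the high CPU usage
+// alert in sync with the cluster's infrastructure topology.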
+func NewHighCPUUsageAlertController(
+	configInformer configv1informers.Interface,
+	dynamicInformersForTargetNamespace dynamicinformer.DynamicSharedInformerFactory,
+	client dynamic.Interface,
+	recorder events.Recorder,
+) factory.Controller {
+	c := &highCPUUsageAlertController{
+		client:               client,
+		infraLister:          configInformer.Infrastructures().Lister(),
+		clusterVersionLister: configInformer.ClusterVersions().Lister(),
+	}
+
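+	// watch PrometheusRule objects in the target namespace so changes to the alert trigger a resync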
+	prometheusAlertInformerForTargetNamespace := dynamicInformersForTargetNamespace.ForResource(schema.GroupVersionResource{
+		Group:    "monitoring.coreos.com",
+		Version:  "v1",
+		Resource: "prometheusrules",
+	})
+
+	return factory.New().
+		WithInformers(configInformer.Infrastructures().Informer(), configInformer.ClusterVersions().Informer(), prometheusAlertInformerForTargetNamespace.Informer()).
+		WithSync(c.sync).ResyncEvery(10*time.Minute).
+		ToController("highCPUUsageAlertController", recorder.WithComponentSuffix("high-cpu-usage-alert-controller"))
+}
+
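+// sync applies the standard CPU utilization alert on multi-node clusters, and the
+// single-node variant rendered with the reserved control plane core count otherwise.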
+func (c *highCPUUsageAlertController) sync(ctx context.Context, syncCtx factory.SyncContext) error {
+	infra, err := c.infraLister.Get("cluster")
+	if err != nil {
+		return err
+	}
+
+	var alertRaw []byte
+
+	if infra.Status.InfrastructureTopology != configv1.SingleReplicaTopologyMode {
+		// the alert is created here, rather than by the static resource controller,
+		// because that controller kept deleting the alert and fighting with this one
+		alertRaw, err = bindata.Asset("assets/alerts/cpu-utilization.yaml")
+		if err != nil {
+			return err
+		}
+	} else {
+		clusterVersion, err := c.clusterVersionLister.Get("version")
+		if err != nil {
+			return err
+		}
+
+		alertRaw, err = snoAlert(ctx, c.client, clusterVersion.Status.Capabilities.EnabledCapabilities, infra.Status.CPUPartitioning)
+		if err != nil {
+			return err
+		}
+	}
+
+	alertObj, err := resourceread.ReadGenericWithUnstructured(alertRaw)
+	if err != nil {
+		return err
+	}
+
+	_, _, err = resourceapply.ApplyPrometheusRule(ctx, c.client, syncCtx.Recorder(), alertObj.(*unstructured.Unstructured))
+	return err
+}
+
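+// snoAlert renders the single-node CPU utilization alert, substituting the number of
+// control plane cores reserved via a PerformanceProfile, or the documented default
+// when none is found.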
+func snoAlert(ctx context.Context, client dynamic.Interface, enabledCapabilities []configv1.ClusterVersionCapability, cpuMode configv1.CPUPartitioningMode) ([]byte, error) {
+	cores := defaultCoresNum
+
+	// if the NodeTuning capability is disabled, there is no PerformanceProfile,
+	// so we proceed with the default value.
+	if sets.New(enabledCapabilities...).Has(configv1.ClusterVersionCapabilityNodeTuning) && cpuMode == configv1.CPUPartitioningAllNodes {
+		foundCores, found, err := performanceProfileControlPlaneCores(ctx, client)
+		if err != nil {
+			return nil, err
+		}
+		// set cores from the PerformanceProfile if one was found;
+		// if not, proceed with the default value
+		if found {
+			cores = foundCores
+		}
+	}
+
+	fileData, err := bindata.Asset("assets/alerts/cpu-utilization-sno.yaml")
+	if err != nil {
+		return nil, err
+	}
+	fileData = bytes.ReplaceAll(fileData, []byte(`${CPU-COUNT}`), []byte(strconv.Itoa(cores)))
+
+	return fileData, nil
+}
+
+// performanceProfileControlPlaneCores returns the number of cores allocated for control plane
+// pods via a PerformanceProfile object. The bool value indicates whether a PerformanceProfile
+// was found for the master nodes.
+func performanceProfileControlPlaneCores(ctx context.Context, client dynamic.Interface) (int, bool, error) {
+	// fetch the resource directly instead of using an informer because the
+	// NodeTuning capability can be disabled at startup and enabled later
+	obj, err := client.Resource(performanceGroup).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return 0, false, err
+	}
+
+	for _, pf := range obj.Items {
+		nodeSelector, found, err := unstructured.NestedStringMap(pf.Object, "spec", "nodeSelector")
+		if err != nil {
+			return 0, false, err
+		}
+		if !found {
+			continue
+		}
+		if _, ok := nodeSelector["node-role.kubernetes.io/master"]; !ok {
+			continue
+		}
+
+		reservedCPU, found, err := unstructured.NestedString(pf.Object, "spec", "cpu", "reserved")
+		if err != nil {
+			return 0, false, err
+		}
+		if !found {
+			continue
+		}
+
+		cores, err := coresInCPUSet(reservedCPU)
+		if err != nil {
+			return 0, false, err
+		}
+		return cores, true, nil
+	}
+
+	return 0, false, nil
+}
+
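+// coresInCPUSet returns the number of CPUs in a cpuset string such as "0-3,8".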
+func coresInCPUSet(set string) (int, error) {
+	cpuMap, err := cpuset.Parse(set)
+	return cpuMap.Size(), err
+}