From 609342e17baa5668ff6b97eac9663d536a34b490 Mon Sep 17 00:00:00 2001 From: wardady Date: Tue, 8 Jul 2025 03:16:35 +0300 Subject: [PATCH 1/2] feat: Add actual pod resources metrics --- docs/metrics/workload/pod-metrics.md | 2 + internal/store/pod.go | 137 +++++++++++++++++++++++++++ pkg/app/server_test.go | 30 ++++++ 3 files changed, 169 insertions(+) diff --git a/docs/metrics/workload/pod-metrics.md b/docs/metrics/workload/pod-metrics.md index 8b8ed9116a..3f3d82348c 100644 --- a/docs/metrics/workload/pod-metrics.md +++ b/docs/metrics/workload/pod-metrics.md @@ -30,7 +30,9 @@ | kube_pod_status_container_ready_time | Gauge | Time when the container of the pod entered Ready state. | seconds | `pod`=<pod-name>
`namespace`=<pod-namespace>
`uid`=<pod-uid> | EXPERIMENTAL | - | | kube_pod_container_status_restarts_total | Counter | The number of container restarts per container | | `container`=<container-name>
`namespace`=<pod-namespace>
`pod`=<pod-name>
`uid`=<pod-uid> | STABLE | - | | kube_pod_container_resource_requests | Gauge | The number of requested request resource by a container. It is recommended to use the `kube_pod_resource_requests` metric exposed by kube-scheduler instead, as it is more precise. | `cpu`=<core>
`memory`=<bytes> | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=< node-name>
`uid`=<pod-uid> | EXPERIMENTAL | - | +| kube_pod_container_actual_resource_requests | Gauge | The number of actually requested request resource by a container calculated based on status.containerStatuses of a Pod. | `cpu`=<core>
`memory`=<bytes> | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=<node-name><br>
`uid`=<pod-uid> | EXPERIMENTAL | - | | kube_pod_container_resource_limits | Gauge | The number of requested limit resource by a container. It is recommended to use the `kube_pod_resource_limits` metric exposed by kube-scheduler instead, as it is more precise. | `cpu`=<core>
`memory`=<bytes> | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=< node-name>
`uid`=<pod-uid> | EXPERIMENTAL | - |
+| kube_pod_container_actual_resource_limits | Gauge | The number of actually requested limit resource by a container calculated based on status.containerStatuses of a Pod. | `cpu`=<core><br>
`memory`=<bytes> | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=<node-name><br>
`uid`=<pod-uid> | EXPERIMENTAL | - | | kube_pod_overhead_cpu_cores | Gauge | The pod overhead in regards to cpu cores associated with running a pod | core | `pod`=<pod-name>
`namespace`=<pod-namespace>
`uid`=<pod-uid> | EXPERIMENTAL | - | | kube_pod_overhead_memory_bytes | Gauge | The pod overhead in regards to memory associated with running a pod | bytes | `pod`=<pod-name>
`namespace`=<pod-namespace>
`uid`=<pod-uid> | EXPERIMENTAL | - | | kube_pod_runtimeclass_name_info | Gauge | The runtimeclass associated with the pod | | `pod`=<pod-name>
`namespace`=<pod-namespace>
`uid`=<pod-uid> | EXPERIMENTAL | - |
diff --git a/internal/store/pod.go b/internal/store/pod.go
index f2714e5d0b..64a91cc1a5 100644
--- a/internal/store/pod.go
+++ b/internal/store/pod.go
@@ -45,7 +45,9 @@ func podMetricFamilies(allowAnnotationsList, allowLabelsList []string) []generat
 		createPodCompletionTimeFamilyGenerator(),
 		createPodContainerInfoFamilyGenerator(),
 		createPodContainerResourceLimitsFamilyGenerator(),
+		createPodContainerActualResourceLimitsFamilyGenerator(),
 		createPodContainerResourceRequestsFamilyGenerator(),
+		createPodContainerActualResourceRequestsFamilyGenerator(),
 		createPodContainerStateStartedFamilyGenerator(),
 		createPodContainerStatusLastTerminatedReasonFamilyGenerator(),
 		createPodContainerStatusLastTerminatedExitCodeFamilyGenerator(),
@@ -164,6 +166,74 @@ func createPodContainerInfoFamilyGenerator() generator.FamilyGenerator {
 	)
 }
 
+func createPodContainerActualResourceLimitsFamilyGenerator() generator.FamilyGenerator {
+	return *generator.NewFamilyGeneratorWithStability(
+		"kube_pod_container_actual_resource_limits",
+		"The number of actually requested limit resource by a container calculated based on status.containerStatuses of a Pod.",
+		metric.Gauge,
+		basemetrics.ALPHA,
+		"",
+		wrapPodFunc(func(p *v1.Pod) *metric.Family {
+			ms := []*metric.Metric{}
+
+			for _, c := range p.Status.ContainerStatuses {
+				if c.Resources == nil {
+					continue
+				}
+
+				lim := c.Resources.Limits
+
+				for resourceName, val := range lim {
+					switch resourceName {
+					case v1.ResourceCPU:
+						ms = append(ms, &metric.Metric{
+							LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitCore)},
+							Value:       convertValueToFloat64(&val),
+						})
+					case v1.ResourceStorage:
+						fallthrough
+					case v1.ResourceEphemeralStorage:
+						fallthrough
+					case v1.ResourceMemory:
+						ms = append(ms, &metric.Metric{
+							LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
+							Value:       float64(val.Value()),
+						})
+					default:
+						if isHugePageResourceName(resourceName) {
+							ms = append(ms, &metric.Metric{
+								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
+								Value:       float64(val.Value()),
+							})
+						}
+						if isAttachableVolumeResourceName(resourceName) {
+							ms = append(ms, &metric.Metric{
+								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)},
+								Value:       float64(val.Value()),
+							})
+						}
+						if isExtendedResourceName(resourceName) {
+							ms = append(ms, &metric.Metric{
+								LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitInteger)},
+								Value:       float64(val.Value()),
+							})
+
+						}
+					}
+				}
+			}
+
+			for _, metric := range ms {
+				metric.LabelKeys = []string{"container", "node", "resource", "unit"}
+			}
+
+			return &metric.Family{
+				Metrics: ms,
+			}
+		}),
+	)
+}
+
 func createPodContainerResourceLimitsFamilyGenerator() generator.FamilyGenerator {
 	return *generator.NewFamilyGeneratorWithStability(
 		"kube_pod_container_resource_limits",
@@ -228,6 +298,73 @@ func createPodContainerResourceLimitsFamilyGenerator() generator.FamilyGenerator
 	)
 }
 
+func createPodContainerActualResourceRequestsFamilyGenerator() generator.FamilyGenerator {
+	return *generator.NewFamilyGeneratorWithStability(
+		"kube_pod_container_actual_resource_requests",
+		"The number of actually requested request resource by a container calculated based on status.containerStatuses of a Pod.",
+		metric.Gauge,
+		basemetrics.ALPHA,
+ "", + wrapPodFunc(func(p *v1.Pod) *metric.Family { + ms := []*metric.Metric{} + + for _, c := range p.Status.ContainerStatuses { + if c.Resources == nil { + continue + } + + req := c.Resources.Requests + + for resourceName, val := range req { + switch resourceName { + case v1.ResourceCPU: + ms = append(ms, &metric.Metric{ + LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitCore)}, + Value: convertValueToFloat64(&val), + }) + case v1.ResourceStorage: + fallthrough + case v1.ResourceEphemeralStorage: + fallthrough + case v1.ResourceMemory: + ms = append(ms, &metric.Metric{ + LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)}, + Value: float64(val.Value()), + }) + default: + if isHugePageResourceName(resourceName) { + ms = append(ms, &metric.Metric{ + LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)}, + Value: float64(val.Value()), + }) + } + if isAttachableVolumeResourceName(resourceName) { + ms = append(ms, &metric.Metric{ + LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitByte)}, + Value: float64(val.Value()), + }) + } + if isExtendedResourceName(resourceName) { + ms = append(ms, &metric.Metric{ + LabelValues: []string{c.Name, p.Spec.NodeName, SanitizeLabelName(string(resourceName)), string(constant.UnitInteger)}, + Value: float64(val.Value()), + }) + } + } + } + } + + for _, metric := range ms { + metric.LabelKeys = []string{"container", "node", "resource", "unit"} + } + + return &metric.Family{ + Metrics: ms, + } + }), + ) +} + func createPodContainerResourceRequestsFamilyGenerator() generator.FamilyGenerator { return *generator.NewFamilyGeneratorWithStability( "kube_pod_container_resource_requests", diff --git a/pkg/app/server_test.go b/pkg/app/server_test.go index 40335028da..e75df477f8 100644 --- a/pkg/app/server_test.go +++ b/pkg/app/server_test.go @@ -209,6 +209,8 @@ func TestFullScrapeCycle(t *testing.T) { expected := `# HELP kube_pod_annotations Kubernetes annotations converted to Prometheus labels. # HELP kube_pod_completion_time [STABLE] Completion time in unix timestamp for a pod. +# HELP kube_pod_container_actual_resource_limits The number of actually requested limit resource by a container calculated based on status.containerStatuses of a Pod. +# HELP kube_pod_container_actual_resource_requests The number of actually requested request resource by a container calculated based on status.containerStatuses of a Pod. # HELP kube_pod_container_info [STABLE] Information about a container in a pod. # HELP kube_pod_container_resource_limits The number of requested limit resource by a container. It is recommended to use the kube_pod_resource_limits metric exposed by kube-scheduler instead, as it is more precise. # HELP kube_pod_container_resource_requests The number of requested request resource by a container. It is recommended to use the kube_pod_resource_requests metric exposed by kube-scheduler instead, as it is more precise. 
@@ -262,6 +264,8 @@ func TestFullScrapeCycle(t *testing.T) {
 # HELP kube_pod_tolerations Information about the pod tolerations
 # TYPE kube_pod_annotations gauge
 # TYPE kube_pod_completion_time gauge
+# TYPE kube_pod_container_actual_resource_limits gauge
+# TYPE kube_pod_container_actual_resource_requests gauge
 # TYPE kube_pod_container_info gauge
 # TYPE kube_pod_container_resource_limits gauge
 # TYPE kube_pod_container_resource_requests gauge
@@ -313,6 +317,16 @@ func TestFullScrapeCycle(t *testing.T) {
 # TYPE kube_pod_status_scheduled_time gauge
 # TYPE kube_pod_status_unschedulable gauge
 # TYPE kube_pod_tolerations gauge
+kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="cpu",unit="core"} 0.3
+kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="ephemeral_storage",unit="byte"} 4e+08
+kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="memory",unit="byte"} 2e+08
+kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="nvidia_com_gpu",unit="integer"} 2
+kube_pod_container_actual_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="storage",unit="byte"} 5e+08
+kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="cpu",unit="core"} 0.3
+kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="ephemeral_storage",unit="byte"} 4e+08
+kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="memory",unit="byte"} 2e+08
+kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="nvidia_com_gpu",unit="integer"} 2
+kube_pod_container_actual_resource_requests{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="storage",unit="byte"} 5e+08
 kube_pod_container_info{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",image_spec="k8s.gcr.io/hyperkube2_spec",image="k8s.gcr.io/hyperkube2",image_id="docker://sha256:bbb",container_id="docker://cd456"} 1
 kube_pod_container_info{namespace="default",pod="pod0",uid="abc-0",container="pod1_con2",image_spec="k8s.gcr.io/hyperkube3_spec",image="k8s.gcr.io/hyperkube3",image_id="docker://sha256:ccc",container_id="docker://ef789"} 1
 kube_pod_container_resource_limits{namespace="default",pod="pod0",uid="abc-0",container="pod1_con1",node="node1",resource="cpu",unit="core"} 0.2
@@ -850,6 +864,22 @@ func pod(client *fake.Clientset, index int) error {
 					ExitCode: 137,
 				},
 			},
+			Resources: &v1.ResourceRequirements{
+				Limits: map[v1.ResourceName]resource.Quantity{
+					v1.ResourceCPU:                    resource.MustParse("300m"),
+					v1.ResourceMemory:                 resource.MustParse("200M"),
+					v1.ResourceEphemeralStorage:       resource.MustParse("400M"),
+					v1.ResourceStorage:                resource.MustParse("500M"),
+					v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
+				},
+				Requests: map[v1.ResourceName]resource.Quantity{
+					v1.ResourceCPU:                    resource.MustParse("300m"),
+					v1.ResourceMemory:                 resource.MustParse("200M"),
+					v1.ResourceEphemeralStorage:       resource.MustParse("400M"),
+					v1.ResourceStorage:                resource.MustParse("500M"),
+					v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"), + }, + }, }, { Name: "pod1_con2", From 422ba2ec77e473c8dc1b198e219f5622f251fd64 Mon Sep 17 00:00:00 2001 From: wardady Date: Fri, 8 Aug 2025 00:45:42 +0300 Subject: [PATCH 2/2] fix: BenchmarkPodStore test adjustment --- internal/store/pod_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/store/pod_test.go b/internal/store/pod_test.go index 113b65d00f..cb64fa023c 100644 --- a/internal/store/pod_test.go +++ b/internal/store/pod_test.go @@ -2282,7 +2282,7 @@ func BenchmarkPodStore(b *testing.B) { }, } - expectedFamilies := 54 + expectedFamilies := 56 for n := 0; n < b.N; n++ { families := f(pod) if len(families) != expectedFamilies {