From 1dbb67891b2c12b2d71177e76ab2f4fd4771ae2a Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Mon, 1 Sep 2025 08:45:19 +0200 Subject: [PATCH] feat: Add rules for IO and network metrics * Include IO and network metrics in Prometheus rules files in the `etc/prometheus/rules` directory of the repo * Add support for IO and network metrics rules in `ceems_tool` * Update dashboard JSON models to use recorded rules in IO and network metrics panels Signed-off-by: Mahendra Paipuri --- cmd/ceems_tool/rules.go | 8 + cmd/ceems_tool/rules/cpu-cray.rules | 127 + .../rules/cpu-ipmi-redfish-hwmon.rules | 128 + cmd/ceems_tool/rules/cpu-rapl.rules | 128 + .../e2e-test-recording-rules-output.txt | 2625 ++++++++++++++++- cmd/ceems_tool/testdata/prometheus.yml | 1 + .../testdata/rules/fake-usage.rules | 18 + etc/prometheus/README.md | 16 + etc/prometheus/rules/host-usage.rules | 5 +- etc/prometheus/rules/io-usage.rules | 86 + etc/prometheus/rules/network-usage.rules | 65 + scripts/e2e-test.sh | 2 +- .../dashboards/admin/cluster-status.json | 332 ++- .../k8s/k8s-single-pod-metrics.json | 40 +- .../openstack/os-single-vm-metrics.json | 40 +- .../slurm/slurm-single-job-metrics.json | 40 +- 16 files changed, 3563 insertions(+), 98 deletions(-) create mode 100644 cmd/ceems_tool/testdata/rules/fake-usage.rules create mode 100644 etc/prometheus/rules/io-usage.rules create mode 100644 etc/prometheus/rules/network-usage.rules diff --git a/cmd/ceems_tool/rules.go b/cmd/ceems_tool/rules.go index 091bc79c..e8e1e2f4 100644 --- a/cmd/ceems_tool/rules.go +++ b/cmd/ceems_tool/rules.go @@ -56,6 +56,10 @@ var ( amdDevExporterPkgPowerMetric, // AMD metrics device exporter "ceems_compute_unit_gpu_index_flag", "ceems_compute_unit_gpu_sm_count", + "ceems_ebpf_read_bytes_total", + "ceems_ebpf_write_bytes_total", + "ceems_ebpf_ingress_bytes_total", + "ceems_ebpf_egress_bytes_total", } nvidiaProfSeriesNames = []string{ @@ -115,6 +119,8 @@ type rulesTemplateData struct { HostPowerQuery string HostPowerSeries string RAPLAvailable 
bool + IOAvailable bool + NetAvailable bool Job model.LabelValue PUE float64 EmissionFactor EmissionFactor @@ -363,6 +369,8 @@ func CreatePromRecordingRules( HostPowerQuery: hostPowerQuery, HostPowerSeries: hostPowerSeries, RAPLAvailable: slices.Contains(jobSeries[job], "ceems_rapl_package_joules_total") && slices.Contains(jobSeries[job], "ceems_rapl_dram_joules_total"), + IOAvailable: slices.Contains(jobSeries[job], "ceems_ebpf_read_bytes_total") || slices.Contains(jobSeries[job], "ceems_ebpf_write_bytes_total"), + NetAvailable: slices.Contains(jobSeries[job], "ceems_ebpf_ingress_bytes_total") || slices.Contains(jobSeries[job], "ceems_ebpf_egress_bytes_total"), Job: job, PUE: pueValue, EmissionFactor: emissionFactor, diff --git a/cmd/ceems_tool/rules/cpu-cray.rules b/cmd/ceems_tool/rules/cpu-cray.rules index 77d2dfc9..42225d99 100644 --- a/cmd/ceems_tool/rules/cpu-cray.rules +++ b/cmd/ceems_tool/rules/cpu-cray.rules @@ -220,3 +220,130 @@ groups: {{- end }} {{- end }} +{{ if .IOAvailable }} + + - name: compute-unit-io-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write errors rate in err/sec. 
+ - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="{{.Job}}"}[{{.RateInterval}}]) + + - name: host-agg-io-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) +{{- end }} + +{{- if .NetAvailable }} + + - name: compute-unit-network-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Netwok ingress bandwidth in bytes/sec. 
+ - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="{{.Job}}"}[{{.RateInterval}}]) + + - name: host-agg-network-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) +{{- end }} diff --git a/cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules b/cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules index 3688fe7d..de17d99d 100644 --- a/cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules +++ b/cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules @@ -245,3 +245,131 @@ groups: ) {{- end }} {{- end }} + +{{- if .IOAvailable }} + + - name: compute-unit-io-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="{{.Job}}"}[{{.RateInterval}}]) + + - name: host-agg-io-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) +{{- end }} + +{{- if .NetAvailable }} + + - name: compute-unit-network-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok egress bandwidth in bytes/sec. 
+ - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="{{.Job}}"}[{{.RateInterval}}]) + + - name: host-agg-network-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) +{{- end }} diff --git a/cmd/ceems_tool/rules/cpu-rapl.rules b/cmd/ceems_tool/rules/cpu-rapl.rules index f92c5ac2..cb44d759 100644 --- a/cmd/ceems_tool/rules/cpu-rapl.rules +++ b/cmd/ceems_tool/rules/cpu-rapl.rules @@ -206,3 +206,131 @@ groups: ) {{- end }} {{- end }} + +{{ if .IOAvailable }} + + - name: compute-unit-io-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # I/O read bandwidth in bytes/sec. 
+ - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="{{.Job}}"}[{{.RateInterval}}]) + + - name: host-agg-io-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) +{{- end }} + +{{- if .NetAvailable }} + + - name: compute-unit-network-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="{{.Job}}"}[{{.RateInterval}}]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="{{.Job}}"}[{{.RateInterval}}]) + + - name: host-agg-network-rules-{{.Job}} + interval: {{.EvaluationInterval}} + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) +{{- end }} diff --git a/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt b/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt index 8e7a56ec..9946fdbe 100644 --- a/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt +++ b/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt @@ -1,5 +1,5 @@ -Number of series found for job series are: 77 -Number of series found for uuid series are: 714 +Number of series found for job series are: 91 +Number of series found for uuid series are: 728 amd-device-metrics-gpu-gpu.rules --- # Recording rules for AMD GPUs scrape job amd-device-metrics-gpu. @@ -464,6 +464,129 @@ groups: ) ) + + + - name: compute-unit-io-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. 
+ - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-cray-amd-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-cray-amd-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-hwmon-amd-gpu.rules --- # Recording rules for scrape job cpu-hwmon-amd-gpu @@ -661,6 +784,128 @@ groups: label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-hwmon-amd-gpu", "instance", "(.*)") ) ) + + - name: compute-unit-io-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-hwmon-amd-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-hwmon-amd-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-ipmi-nvidia-gpu.rules --- # Recording rules for scrape job cpu-ipmi-nvidia-gpu @@ -826,6 +1071,128 @@ groups: label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-ipmi-nvidia-gpu", "instance", "(.*)") ) ) + + - name: compute-unit-io-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O read errors rate in err/sec. 
+ - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-ipmi.rules --- # Recording rules for scrape job cpu-only-ipmi @@ -1023,6 +1390,128 @@ groups: label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-only-ipmi", "instance", "(.*)") ) ) + + - name: compute-unit-io-rules-cpu-only-ipmi + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-ipmi"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-ipmi"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-ipmi"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-ipmi"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-ipmi"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-ipmi"}[2s]) + + - name: host-agg-io-rules-cpu-only-ipmi + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-ipmi + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-ipmi"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-ipmi"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-ipmi"}[2s]) + + # Netwok egress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-ipmi"}[2s]) + + - name: host-agg-network-rules-cpu-only-ipmi + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-rapl.rules --- # Recording rules for scrape job cpu-only-rapl @@ -1185,6 +1674,130 @@ groups: label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-only-rapl", "instance", "(.*)") ) ) + + + + - name: compute-unit-io-rules-cpu-only-rapl + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-rapl"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-rapl"}[2s]) + + # I/O read errors rate in err/sec. 
+ - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-rapl"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-rapl"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-rapl"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-rapl"}[2s]) + + - name: host-agg-io-rules-cpu-only-rapl + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-rapl + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-rapl"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-rapl"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-rapl"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-rapl"}[2s]) + + - name: host-agg-network-rules-cpu-only-rapl + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-redfish.rules --- # Recording rules for scrape job cpu-only-redfish @@ -1365,22 +1978,144 @@ groups: * 100 - # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - - record: job:ceems_host_power_watts:pue + # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. + - record: job:ceems_host_power_watts:pue + expr: |2 + sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"})) + + # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated for country FR + - record: job:ceems_host_emissions_g_s:pue + expr: |2 + sum by (job, country_code, country, provider) ( + ( + job:ceems_host_power_watts:pue{job="cpu-only-redfish"} / 3.6e+06 + * on (job) group_right () + label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-only-redfish", "instance", "(.*)") + ) + ) + + - name: compute-unit-io-rules-cpu-only-redfish + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-redfish"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-redfish"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-redfish"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-redfish"}[2s]) + + # I/O write requests rate in req/sec. 
+ - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-redfish"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-redfish"}[2s]) + + - name: host-agg-io-rules-cpu-only-redfish + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-redfish + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. 
+ - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-redfish"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-redfish"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-redfish"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-redfish"}[2s]) + + - name: host-agg-network-rules-cpu-only-redfish + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate expr: |2 - sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"})) + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) - # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs - # in a Prometheus job accounting PUE value. - # The equivalent emissions are estimated for country FR - - record: job:ceems_host_emissions_g_s:pue + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_bytes:sum_irate expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job="cpu-only-redfish"} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-only-redfish", "instance", "(.*)") - ) + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) ) cpu-redfish-nvidia-gpu.rules --- @@ -1579,6 +2314,128 @@ groups: label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-redfish-nvidia-gpu", "instance", "(.*)") ) ) + + - name: compute-unit-io-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write errors rate in err/sec. 
+ - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) ipmi-nvidia-gpu-gpu.rules --- # Recording rules for NVIDIA GPUs scrape job ipmi-nvidia-gpu. @@ -2472,6 +3329,129 @@ groups: "(.*)" ) + + + - name: compute-unit-io-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. 
+ - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-cray-amd-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-cray-amd-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-hwmon-amd-gpu.rules --- # Recording rules for scrape job cpu-hwmon-amd-gpu @@ -2674,6 +3654,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write errors rate in err/sec. 
+ - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-hwmon-amd-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-hwmon-amd-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-ipmi-nvidia-gpu.rules --- # Recording rules for scrape job cpu-ipmi-nvidia-gpu @@ -2844,6 +3946,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. 
+ - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-ipmi.rules --- # Recording rules for scrape job cpu-only-ipmi @@ -3046,6 +4270,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-only-ipmi + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-ipmi"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-ipmi"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-ipmi"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-ipmi"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-ipmi"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-ipmi"}[2s]) + + - name: host-agg-io-rules-cpu-only-ipmi + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-ipmi + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-ipmi"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-ipmi"}[2s]) + + # Netwok egress bandwidth in bytes/sec. 
+ - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-ipmi"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-ipmi"}[2s]) + + - name: host-agg-network-rules-cpu-only-ipmi + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-rapl.rules --- # Recording rules for scrape job cpu-only-rapl @@ -3213,6 +4559,130 @@ groups: "instance", "(.*)" ) + + + + - name: compute-unit-io-rules-cpu-only-rapl + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-rapl"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-rapl"}[2s]) + + # I/O read errors rate in err/sec. 
+ - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-rapl"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-rapl"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-rapl"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-rapl"}[2s]) + + - name: host-agg-io-rules-cpu-only-rapl + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-rapl + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-rapl"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-rapl"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-rapl"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-rapl"}[2s]) + + - name: host-agg-network-rules-cpu-only-rapl + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-redfish.rules --- # Recording rules for scrape job cpu-only-redfish @@ -3415,6 +4885,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-only-redfish + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-redfish"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-redfish"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-redfish"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-redfish"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-redfish"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-redfish"}[2s]) + + - name: host-agg-io-rules-cpu-only-redfish + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-redfish + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-redfish"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-redfish"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-redfish"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-redfish"}[2s]) + + - name: host-agg-network-rules-cpu-only-redfish + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-redfish-nvidia-gpu.rules --- # Recording rules for scrape job cpu-redfish-nvidia-gpu @@ -3617,6 +5209,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write requests rate in req/sec. 
+ - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. 
+ - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) ipmi-nvidia-gpu-gpu.rules --- # Recording rules for NVIDIA GPUs scrape job ipmi-nvidia-gpu. 
@@ -4520,6 +6234,129 @@ groups: "(.*)" ) + + + - name: compute-unit-io-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-cray-amd-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-cray-amd-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-cray-amd-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-cray-amd-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-cray-amd-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-hwmon-amd-gpu.rules --- # Recording rules for scrape job cpu-hwmon-amd-gpu @@ -4709,18 +6546,140 @@ groups: # - record: job:ceems_host_emissions_g_s:pue expr: |2 - label_replace( - label_replace( - 50 * job:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, - "provider", - "custom", - "instance", - "(.*)" - ), - "country_code", - "", - "instance", - "(.*)" + label_replace( + label_replace( + 50 * job:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, + "provider", + "custom", + "instance", + "(.*)" + ), + "country_code", + "", + "instance", + "(.*)" + ) + + - name: compute-unit-io-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. 
+ - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-hwmon-amd-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-hwmon-amd-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-hwmon-amd-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) ) cpu-ipmi-nvidia-gpu.rules --- @@ -4892,6 +6851,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-ipmi-nvidia-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-ipmi.rules --- # Recording rules for scrape job cpu-only-ipmi @@ -5094,6 +7175,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-only-ipmi + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-ipmi"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-ipmi"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-ipmi"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-ipmi"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-ipmi"}[2s]) + + # I/O write errors rate in err/sec. 
+ - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-ipmi"}[2s]) + + - name: host-agg-io-rules-cpu-only-ipmi + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-ipmi + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-ipmi"}[2s]) + + # Netwok ingress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-ipmi"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-ipmi"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-ipmi"}[2s]) + + - name: host-agg-network-rules-cpu-only-ipmi + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-rapl.rules --- # Recording rules for scrape job cpu-only-rapl @@ -5261,6 +7464,130 @@ groups: "instance", "(.*)" ) + + + + - name: compute-unit-io-rules-cpu-only-rapl + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-rapl"}[2s]) + + # I/O read requests rate in req/sec. 
+ - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-rapl"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-rapl"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-rapl"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-rapl"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-rapl"}[2s]) + + - name: host-agg-io-rules-cpu-only-rapl + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-rapl + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-rapl"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-rapl"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-rapl"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-rapl"}[2s]) + + - name: host-agg-network-rules-cpu-only-rapl + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-only-redfish.rules --- # Recording rules for scrape job cpu-only-redfish @@ -5463,6 +7790,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-only-redfish + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-only-redfish"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-only-redfish"}[2s]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-only-redfish"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-only-redfish"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-only-redfish"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-only-redfish"}[2s]) + + - name: host-agg-io-rules-cpu-only-redfish + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-only-redfish + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-only-redfish"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-only-redfish"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-only-redfish"}[2s]) + + # Netwok egress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-only-redfish"}[2s]) + + - name: host-agg-network-rules-cpu-only-redfish + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) cpu-redfish-nvidia-gpu.rules --- # Recording rules for scrape job cpu-redfish-nvidia-gpu @@ -5665,6 +8114,128 @@ groups: "instance", "(.*)" ) + + - name: compute-unit-io-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O read errors rate in err/sec. 
+ - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + - name: host-agg-io-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) + + - name: compute-unit-network-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Netwok ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok ingress packets rate in pkts/sec. + - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + # Netwok egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total{job="cpu-redfish-nvidia-gpu"}[2s]) + + - name: host-agg-network-rules-cpu-redfish-nvidia-gpu + interval: 1s + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) ipmi-nvidia-gpu-gpu.rules --- # Recording rules for NVIDIA GPUs scrape job ipmi-nvidia-gpu. diff --git a/cmd/ceems_tool/testdata/prometheus.yml b/cmd/ceems_tool/testdata/prometheus.yml index a804a908..acf6e515 100644 --- a/cmd/ceems_tool/testdata/prometheus.yml +++ b/cmd/ceems_tool/testdata/prometheus.yml @@ -7,6 +7,7 @@ global: rule_files: - ../../../etc/prometheus/rules/*.rules + - rules/*.rules # scrape configuration scrape_configs: diff --git a/cmd/ceems_tool/testdata/rules/fake-usage.rules b/cmd/ceems_tool/testdata/rules/fake-usage.rules new file mode 100644 index 00000000..12cf26c5 --- /dev/null +++ b/cmd/ceems_tool/testdata/rules/fake-usage.rules @@ -0,0 +1,18 @@ +# Fake I/O and network metrics to test rules generation. These metrics +# come from eBPF collector which is hard to reproduce just with fixtures. + +groups: + - name: fake-usage-metrics + rules: + # Fake I/O metric + - record: ceems_ebpf_read_bytes_total + labels: + mountpoint: /fakemount + expr: 10 * ceems_cpu_count / ceems_cpu_count + + # Fake network metric + - record: ceems_ebpf_ingress_bytes_total + labels: + proto: tcp + family: ipv4 + expr: 10 * ceems_cpu_count / ceems_cpu_count diff --git a/etc/prometheus/README.md b/etc/prometheus/README.md index d95af9dc..951cd1cc 100644 --- a/etc/prometheus/README.md +++ b/etc/prometheus/README.md @@ -23,6 +23,22 @@ must be preferred to generate recording rules. The rules in this file estimate the host CPU and CPU memory usage for each compute unit and also average usage aggregated over Prometheus jobs. 
+### [`io-usage.rules`](./rules/io-usage.rules) + +The rules in this file estimate the I/O read/write bandwidths for each +compute unit and also total usage aggregated over Prometheus jobs. These +metrics are available only when the +[eBPF](https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#ebpf-sub-collector) +collector is enabled on the CEEMS exporter. + +### [`network-usage.rules`](./rules/network-usage.rules) + +The rules in this file estimate the network ingress and egress bandwidths for each +compute unit and also total usage aggregated over Prometheus jobs. These +metrics are available only when the +[eBPF](https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#ebpf-sub-collector) +collector is enabled on the CEEMS exporter. + ### [`host-power-cray-pmc.rules`](./rules/host-power-cray-pmc.rules) The rules defined in this file estimate host power usage for the nodes where Cray diff --git a/etc/prometheus/rules/host-usage.rules b/etc/prometheus/rules/host-usage.rules index 54b30e33..4a85123a 100644 --- a/etc/prometheus/rules/host-usage.rules +++ b/etc/prometheus/rules/host-usage.rules @@ -2,7 +2,7 @@ # Recording rules for compute unit host CPU and memory usage # # The rules estimate the compute unit's CPU and CPU memory usage -# metrics +# metrics, I/O metrics and network metrics # # Optional placeholders to replace: # @@ -34,8 +34,7 @@ groups: / (ceems_compute_unit_memory_total_bytes > 0) - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts + # The following recording rules estimate the average CPU and CPU memory usages aggregated for all hosts # per Prometheus job. 
- name: host-agg-cpu-usage-rules # interval: diff --git a/etc/prometheus/rules/io-usage.rules b/etc/prometheus/rules/io-usage.rules new file mode 100644 index 00000000..66b64e2c --- /dev/null +++ b/etc/prometheus/rules/io-usage.rules @@ -0,0 +1,86 @@ +--- +# Recording rules for compute unit I/O usage +# +# The rules estimate the compute unit's I/O usage metrics +# +# Optional placeholders to replace: +# +# : Evaluation interval +# +# By default a rate interval of 1m is used. For scrape intervals longer than 30s, use a larger +# rate interval. +# +groups: + - name: io-usage-rules + # interval: + rules: + # I/O read bandwidth in bytes/sec. + - record: uuid:ceems_io_read_bytes:irate + expr: irate(ceems_ebpf_read_bytes_total[1m]) + + # I/O read requests rate in req/sec. + - record: uuid:ceems_io_read_requests:irate + expr: irate(ceems_ebpf_read_requests_total[1m]) + + # I/O read errors rate in err/sec. + - record: uuid:ceems_io_read_errors:irate + expr: irate(ceems_ebpf_read_errors_total[1m]) + + # I/O write bandwidth in bytes/sec. + - record: uuid:ceems_io_write_bytes:irate + expr: irate(ceems_ebpf_write_bytes_total[1m]) + + # I/O write requests rate in req/sec. + - record: uuid:ceems_io_write_requests:irate + expr: irate(ceems_ebpf_write_requests_total[1m]) + + # I/O write errors rate in err/sec. + - record: uuid:ceems_io_write_errors:irate + expr: irate(ceems_ebpf_write_errors_total[1m]) + + # The following recording rules estimate the total I/O usages aggregated for all hosts + # per Prometheus job. + - name: host-agg-io-usage-rules + # interval: + rules: + # Total I/O read bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_bytes_total[1m])) + ) + + # Total I/O read requests in req/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_io_read_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_requests_total[1m])) + ) + + # Total I/O read errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_read_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_read_errors_total[1m])) + ) + + # Total I/O write bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_bytes:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_bytes_total[1m])) + ) + + # Total I/O write requests in req/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_requests:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_requests_total[1m])) + ) + + # Total I/O write errors in err/sec for all hosts in a Prometheus job. + - record: job:ceems_io_write_errors:sum_irate + expr: |2 + sum by (job, mountpoint) ( + sum by (job, mountpoint, instance) (irate(ceems_ebpf_write_errors_total[1m])) + ) diff --git a/etc/prometheus/rules/network-usage.rules b/etc/prometheus/rules/network-usage.rules new file mode 100644 index 00000000..3c148879 --- /dev/null +++ b/etc/prometheus/rules/network-usage.rules @@ -0,0 +1,65 @@ +--- +# Recording rules for compute unit network usage +# +# The rules estimate the compute unit's ingress and egress +# network usage metrics +# +# Optional placeholders to replace: +# +# : Evaluation interval +# +# By default a rate interval of 1m is used. For scrape intervals longer than 30s, use a larger +# rate interval. +# +groups: + - name: network-usage-rules + # interval: + rules: + # Network ingress bandwidth in bytes/sec. + - record: uuid:ceems_net_ingress_bytes:irate + expr: irate(ceems_ebpf_ingress_bytes_total[1m]) + + # Network ingress packets rate in pkts/sec. 
+ - record: uuid:ceems_net_ingress_packets:irate + expr: irate(ceems_ebpf_ingress_packets_total[1m]) + + # Network egress bandwidth in bytes/sec. + - record: uuid:ceems_net_egress_bytes:irate + expr: irate(ceems_ebpf_egress_bytes_total[1m]) + + # Network egress packets rate in pkts/sec. + - record: uuid:ceems_net_egress_packets:irate + expr: irate(ceems_ebpf_egress_packets_total[1m]) + + # The following recording rules estimate the total network usages aggregated for all hosts + # per Prometheus job. + - name: host-agg-network-usage-rules + # interval: + rules: + # Total network ingress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_bytes_total[1m])) + ) + + # Total network ingress packets rate in pkts/sec for all hosts in a Prometheus job. + - record: job:ceems_net_ingress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_ingress_packets_total[1m])) + ) + + # Total network egress bandwidth in bytes/sec for all hosts in a Prometheus job. + - record: job:ceems_net_egress_bytes:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_bytes_total[1m])) + ) + + # Total network egress packets rate in pkts/sec for all hosts in a Prometheus job. 
+ - record: job:ceems_net_egress_packets:sum_irate + expr: |2 + sum by (job, family, proto) ( + sum by (job, family, proto, instance) (irate(ceems_ebpf_egress_packets_total[1m])) + ) diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 4dd2d3a4..1630bb57 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -1754,7 +1754,7 @@ then cp cmd/ceems_tool/testdata/prometheus.yml "${tmpdir}/prometheus.yml" # Ignore existing recording rules - sed -i 's/prometheus/prometheus1/g' "${tmpdir}/prometheus.yml" + sed -i 's/rules/rules1/g' "${tmpdir}/prometheus.yml" prometheus \ --config.file "${tmpdir}/prometheus.yml" \ diff --git a/thirdparty/grafana/dashboards/admin/cluster-status.json b/thirdparty/grafana/dashboards/admin/cluster-status.json index eabeee58..3ac1aa87 100644 --- a/thirdparty/grafana/dashboards/admin/cluster-status.json +++ b/thirdparty/grafana/dashboards/admin/cluster-status.json @@ -2,7 +2,7 @@ "__inputs": [ { "name": "DS_PROMETHEUS", - "label": "Prometheus datasource. 
Use vanilla Prometheus even when CEEMS LB is enabled.", + "label": "Prometheus datasource.", "description": "", "type": "datasource", "pluginId": "prometheus", @@ -1272,7 +1272,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1401,7 +1402,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1530,7 +1532,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1659,7 +1662,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1789,7 +1793,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1943,7 +1948,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2044,6 +2050,318 @@ } ], "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total I/O bandwidth from each job", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", 
+ "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 94, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (job,mountpoint) (job:ceems_io_read_bytes:sum_irate{job=~\"${job:pipe}\"})", + "instant": false, + "legendFormat": "Read I/O on {{mountpoint}} - {{job}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (job,mountpoint) (job:ceems_io_write_bytes:sum_irate{job=~\"${job:pipe}\"})", + "hide": false, + "instant": false, + "legendFormat": "Write I/O on {{mountpoint}} - {{job}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(job:ceems_io_read_bytes:sum_irate)", + "hide": false, + "instant": false, + "legendFormat": "Total Read I/O", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(job:ceems_io_write_bytes:sum_irate)", + "hide": false, + "instant": false, + "legendFormat": "Total Write I/O", + "range": true, + "refId": "D" + } + ], + "title": "Total I/O Usage", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(.*) - (?:ceems|dcgm)-(.*)", + "renamePattern": "$1 - $2" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total network ingress/egress bandwidths from 
each job", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 52 + }, + "id": 95, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (job,proto,family) (job:ceems_net_ingress_bytes:sum_irate{job=~\"${job:pipe}\"})", + "instant": false, + "legendFormat": "Ingress {{proto}} {{family}} - {{job}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (job,proto,family) (job:ceems_net_egress_bytes:sum_irate{job=~\"${job:pipe}\"})", + "hide": false, + "instant": false, + "legendFormat": "Egress {{proto}} {{family}} - {{job}}", + "range": true, + "refId": "B" + }, 
+ { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(job:ceems_net_ingress_bytes:sum_irate)", + "hide": false, + "instant": false, + "legendFormat": "Total Ingress", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(job:ceems_net_egress_bytes:sum_irate)", + "hide": false, + "instant": false, + "legendFormat": "Total Egress", + "range": true, + "refId": "D" + } + ], + "title": "Total Network Usage", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(.*) - (?:ceems|dcgm)-(.*)", + "renamePattern": "$1 - $2" + } + } + ], + "type": "timeseries" } ], "title": "Overall Usage Stats in Time from ${__from:date:YYYY-MM-DD HH:mm:ss} to ${__to:date:YYYY-MM-DD HH:mm:ss}", diff --git a/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json b/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json index 02bbda96..311f91b1 100644 --- a/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json +++ b/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json @@ -2698,7 +2698,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_read_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_read_bytes:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Read Bandwith on {{mountpoint}} from {{hostname}}", @@ -2711,7 +2711,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_write_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_write_bytes:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Write Bandwith on {{mountpoint}} from {{hostname}}", @@ -2724,7 +2724,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_read_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + 
"expr": "sum by (mountpoint) (uuid:ceems_io_read_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Read Bandwith from {{mountpoint}}", @@ -2737,7 +2737,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_write_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_write_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Write Bandwith from {{mountpoint}}", @@ -2842,7 +2842,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_read_requests_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_read_requests:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Read Requests on {{mountpoint}} from {{hostname}}", @@ -2855,7 +2855,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_write_requests_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_write_requests:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Write Requests on {{mountpoint}} from {{hostname}}", @@ -2868,7 +2868,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_read_requests_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_read_requests:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Read Requests on {{mountpoint}}", @@ -2881,7 +2881,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_write_requests_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_write_requests:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Write Requests on {{mountpoint}}", @@ -2986,7 +2986,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"irate(ceems_ebpf_read_errors_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_read_errors:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Read Errors on {{mountpoint}} from {{hostname}}", @@ -2999,7 +2999,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_write_errors_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_write_errors:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Write Errors on {{mountpoint}} from {{hostname}}", @@ -3012,7 +3012,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_read_errors_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_read_errors:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Read Errors on {{mountpoint}}", @@ -3025,7 +3025,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_write_errors_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_write_errors:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Write Errors on {{mountpoint}}", @@ -3139,7 +3139,7 @@ "targets": [ { "editorMode": "code", - "expr": "irate(ceems_ebpf_ingress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_ingress_bytes:irate{uuid=\"${uuid}\"}", "legendFormat": "Ingress Bandwith {{proto}}/{{family}} on {{hostname}}", "range": true, "refId": "A" @@ -3150,7 +3150,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_egress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_egress_bytes:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Egress Bandwith {{proto}}/{{family}} on {{hostname}}", @@ -3163,7 +3163,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"sum by (proto,family) (irate(ceems_ebpf_ingress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_ingress_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Ingress Bandwith {{proto}}/{{family}}", @@ -3176,7 +3176,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_egress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_egress_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Egress Bandwith {{proto}}/{{family}}", @@ -3276,7 +3276,7 @@ "targets": [ { "editorMode": "code", - "expr": "irate(ceems_ebpf_ingress_packets_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_ingress_packets:irate{uuid=\"${uuid}\"}", "legendFormat": "Ingress Packets {{proto}}/{{family}} on {{hostname}}", "range": true, "refId": "A" @@ -3287,7 +3287,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_egress_packets_total{uuid=\"${uuid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_egress_packets:irate{uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Egress Packets {{proto}}/{{family}} on {{hostname}}", @@ -3300,7 +3300,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_ingress_packets_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_ingress_packets:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Ingress Packets {{proto}}/{{family}}", @@ -3313,7 +3313,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_egress_packets_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_egress_packets:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": 
"Total Egress Packets {{proto}}/{{family}}", diff --git a/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json b/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json index f871e235..f3b48d7c 100644 --- a/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json +++ b/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json @@ -1869,7 +1869,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_read_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_io_read_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Read Bandwith on {{mountpoint}}", @@ -1882,7 +1882,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_write_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_io_write_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Write Bandwith on {{mountpoint}}", @@ -1895,7 +1895,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (mountpoint) (irate(ceems_ebpf_read_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (mountpoint) (uuid:ceems_io_read_bytes:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Read Bandwith on {{mountpoint}}", @@ -1908,7 +1908,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (mountpoint) (irate(ceems_ebpf_write_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (mountpoint) (uuid:ceems_io_write_bytes:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Write Bandwith on {{mountpoint}}", @@ -2013,7 +2013,7 @@ "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_read_requests_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_io_read_requests:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Read Requests on {{mountpoint}}", @@ -2026,7 +2026,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_write_requests_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_io_write_requests:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Write Requests on {{mountpoint}}", @@ -2039,7 +2039,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (mountpoint) (irate(ceems_ebpf_read_requests_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (mountpoint) (uuid:ceems_io_read_requests:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Read Requests on {{mountpoint}}", @@ -2052,7 +2052,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (mountpoint) (irate(ceems_ebpf_write_requests_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (mountpoint) (uuid:ceems_io_write_requests:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Write Requests on {{mountpoint}}", @@ -2157,7 +2157,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_read_errors_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_io_read_errors:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Read Errors on {{mountpoint}}", @@ -2170,7 +2170,7 @@ "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_write_errors_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_io_write_errors:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Write Errors on {{mountpoint}}", @@ -2183,7 +2183,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (mountpoint) (irate(ceems_ebpf_read_errors_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (mountpoint) (uuid:ceems_io_read_errors:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Read Errors on {{mountpoint}}", @@ -2196,7 +2196,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (mountpoint) (irate(ceems_ebpf_write_errors_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (mountpoint) (uuid:ceems_io_write_errors:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Write Errors on {{mountpoint}}", @@ -2309,7 +2309,7 @@ "targets": [ { "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_ingress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_net_ingress_bytes:irate{uuid=\"${uuid}\"})", "legendFormat": "Ingress Bandwith {{proto}}/{{family}}", "range": true, "refId": "A", @@ -2324,7 +2324,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_egress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_net_egress_bytes:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Egress Bandwith {{proto}}/{{family}}", @@ -2337,7 +2337,7 @@ "uid": "${DS_PROMETHEUS}" }, 
"editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (proto,family) (irate(ceems_ebpf_ingress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (proto,family) (uuid:ceems_net_ingress_bytes:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Ingress Bandwith {{proto}}/{{family}}", @@ -2350,7 +2350,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (proto,family) (irate(ceems_ebpf_egress_bytes_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (proto,family) (uuid:ceems_net_egress_bytes:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Egress Bandwith {{proto}}/{{family}}", @@ -2449,7 +2449,7 @@ "targets": [ { "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_ingress_packets_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_net_ingress_packets:irate{uuid=\"${uuid}\"})", "legendFormat": "Ingress Packets {{proto}}/{{family}}", "range": true, "refId": "A", @@ -2464,7 +2464,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (irate(ceems_ebpf_egress_packets_total{uuid=\"${uuid}\"}[$__rate_interval]))", + "expr": "sum without (instance,hostname) (uuid:ceems_net_egress_packets:irate{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Egress Packets {{proto}}/{{family}}", @@ -2477,7 +2477,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (proto,family) (irate(ceems_ebpf_ingress_packets_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (proto,family) (uuid:ceems_net_ingress_packets:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Ingress Packets 
{{proto}}/{{family}}", @@ -2490,7 +2490,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (sum by (proto,family) (irate(ceems_ebpf_egress_packets_total{uuid=\"${uuid}\"}[$__rate_interval])))", + "expr": "sum without (instance,hostname) (sum by (proto,family) (uuid:ceems_net_egress_packets:irate{uuid=\"${uuid}\"}))", "hide": false, "instant": false, "legendFormat": "Total Egress Packets {{proto}}/{{family}}", diff --git a/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json b/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json index 681e3c8f..cb61c5bb 100644 --- a/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json +++ b/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json @@ -2698,7 +2698,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_read_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_read_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Read Bandwith on {{mountpoint}} from {{hostname}}", @@ -2711,7 +2711,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_write_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_write_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Write Bandwith on {{mountpoint}} from {{hostname}}", @@ -2724,7 +2724,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_read_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_read_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Read Bandwith from {{mountpoint}}", @@ -2737,7 +2737,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - 
"expr": "sum by (mountpoint) (irate(ceems_ebpf_write_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_write_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Write Bandwith from {{mountpoint}}", @@ -2842,7 +2842,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_read_requests_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_read_requests:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Read Requests on {{mountpoint}} from {{hostname}}", @@ -2855,7 +2855,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_write_requests_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_write_requests:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Write Requests on {{mountpoint}} from {{hostname}}", @@ -2868,7 +2868,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_read_requests_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_read_requests:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Read Requests on {{mountpoint}}", @@ -2881,7 +2881,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_write_requests_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_write_requests:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Write Requests on {{mountpoint}}", @@ -2986,7 +2986,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"irate(ceems_ebpf_read_errors_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_read_errors:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Read Errors on {{mountpoint}} from {{hostname}}", @@ -2999,7 +2999,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_write_errors_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_io_write_errors:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Write Errors on {{mountpoint}} from {{hostname}}", @@ -3012,7 +3012,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_read_errors_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_read_errors:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Read Errors on {{mountpoint}}", @@ -3025,7 +3025,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (mountpoint) (irate(ceems_ebpf_write_errors_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (mountpoint) (uuid:ceems_io_write_errors:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Write Errors on {{mountpoint}}", @@ -3139,7 +3139,7 @@ "targets": [ { "editorMode": "code", - "expr": "irate(ceems_ebpf_ingress_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_ingress_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "legendFormat": "Ingress Bandwith {{proto}}/{{family}} on {{hostname}}", "range": true, "refId": "A" @@ -3150,7 +3150,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_egress_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + 
"expr": "uuid:ceems_net_egress_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Egress Bandwith {{proto}}/{{family}} on {{hostname}}", @@ -3163,7 +3163,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_ingress_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_ingress_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Ingress Bandwith {{proto}}/{{family}}", @@ -3176,7 +3176,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_egress_bytes_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_egress_bytes:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Egress Bandwith {{proto}}/{{family}}", @@ -3276,7 +3276,7 @@ "targets": [ { "editorMode": "code", - "expr": "irate(ceems_ebpf_ingress_packets_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_ingress_packets:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "legendFormat": "Ingress Packets {{proto}}/{{family}} on {{hostname}}", "range": true, "refId": "A" @@ -3287,7 +3287,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "irate(ceems_ebpf_egress_packets_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval])", + "expr": "uuid:ceems_net_egress_packets:irate{instance=~\"${host}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Egress Packets {{proto}}/{{family}} on {{hostname}}", @@ -3300,7 +3300,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_ingress_packets_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by 
(proto,family) (uuid:ceems_net_ingress_packets:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Ingress Packets {{proto}}/{{family}}", @@ -3313,7 +3313,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (proto,family) (irate(ceems_ebpf_egress_packets_total{instance=~\"${host}\",uuid=\"${jobid}\"}[$__rate_interval]))", + "expr": "sum by (proto,family) (uuid:ceems_net_egress_packets:irate{instance=~\"${host}\",uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Total Egress Packets {{proto}}/{{family}}",