diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bad8ea5c..16ac2a78 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -87,7 +87,7 @@ jobs: docker-test: needs: [docker] uses: ./.github/workflows/step_tests-remote-images.yml - if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') + if: contains(github.ref, 'refs/heads/main') # Since workflow_dispatch inputs are only available on manual triggers # we need to set default values to the context vars here with: @@ -112,7 +112,7 @@ jobs: quay-test: needs: [quay] uses: ./.github/workflows/step_tests-remote-images.yml - if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') + if: contains(github.ref, 'refs/heads/main') # Since workflow_dispatch inputs are only available on manual triggers # we need to set default values to the context vars here with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 81c9a5cc..9b958b49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ # Changelog +## 0.11.0 / 2025-*-* + +### Breaking Changes + +#### CEEMS Exporter + +- Collector `ipmi_dcmi` has been renamed to `ipmi` as more functionality beyond DCMI has been added to the collector. 
+- The following metric names have been renamed to be more consistent with Prometheus naming conventions: + * `ceems_ipmi_dcmi_current_watts` -> `ceems_ipmi_dcmi_power_current_watts` + * `ceems_ipmi_dcmi_min_watts` -> `ceems_ipmi_dcmi_power_min_watts` + * `ceems_ipmi_dcmi_max_watts` -> `ceems_ipmi_dcmi_power_max_watts` + * `ceems_ipmi_dcmi_avg_watts` -> `ceems_ipmi_dcmi_power_avg_watts` + * `ceems_redfish_current_watts` -> `ceems_redfish_power_current_watts` + * `ceems_redfish_min_watts` -> `ceems_redfish_power_min_watts` + * `ceems_redfish_max_watts` -> `ceems_redfish_power_max_watts` + * `ceems_redfish_avg_watts` -> `ceems_redfish_power_avg_watts` + ## 0.10.2 / 2025-08-07 - [BUGFIX] Fix bpf code to work with LLVM 20 [#393](https://github.com/mahendrapaipuri/ceems/pull/393) ([@mahendrapaipuri](https://github.com/mahendrapaipuri)) diff --git a/cmd/ceems_exporter/main_test.go b/cmd/ceems_exporter/main_test.go index 0c48db8d..f402a6b2 100644 --- a/cmd/ceems_exporter/main_test.go +++ b/cmd/ceems_exporter/main_test.go @@ -48,7 +48,7 @@ func TestFileDescriptorLeak(t *testing.T) { "--web.listen-address", address, "--path.cgroupfs", sysfsPath, "--path.procfs", procfsPath, - "--collector.ipmi_dcmi.cmd", "pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi", + "--collector.ipmi.dcmi.cmd", "pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi", // "--no-security.drop-privileges", ) test := func(pid int) error { diff --git a/cmd/ceems_tool/rules.go b/cmd/ceems_tool/rules.go index a7abfcab..4b13a77c 100644 --- a/cmd/ceems_tool/rules.go +++ b/cmd/ceems_tool/rules.go @@ -27,15 +27,21 @@ import ( //go:embed rules var rulesFS embed.FS +const ( + ipmiPowerMetric = "ceems_ipmi_dcmi_power_current_watts" + redfishPowerMetric = "ceems_redfish_power_current_watts" + crayPowerMetric = "ceems_cray_pm_counters_power_watts" +) + var ( seriesNames = []string{ "ceems_compute_unit_cpu_user_seconds_total", "ceems_compute_unit_memory_used_bytes", "ceems_rapl_package_joules_total", 
"ceems_rapl_dram_joules_total", - "ceems_ipmi_dcmi_current_watts", - "ceems_redfish_current_watts", - "ceems_cray_pm_counters_power_watts", + ipmiPowerMetric, + redfishPowerMetric, + crayPowerMetric, "ceems_emissions_gCo2_kWh", "DCGM_FI_DEV_POWER_USAGE_INSTANT", "amd_gpu_power", @@ -241,15 +247,15 @@ func CreatePromRecordingRules( var hostPowerSeries string switch { - case slices.Contains(jobSeries[job], "ceems_cray_pm_counters_power_watts"): + case slices.Contains(jobSeries[job], crayPowerMetric): tmplFile = "cpu-cray.rules" - hostPowerSeries = "ceems_cray_pm_counters_power_watts" - case slices.Contains(jobSeries[job], "ceems_redfish_current_watts"): + hostPowerSeries = crayPowerMetric + case slices.Contains(jobSeries[job], redfishPowerMetric): tmplFile = "cpu-ipmi-redfish.rules" - hostPowerSeries = "ceems_redfish_current_watts" - case slices.Contains(jobSeries[job], "ceems_ipmi_dcmi_current_watts"): + hostPowerSeries = redfishPowerMetric + case slices.Contains(jobSeries[job], ipmiPowerMetric): tmplFile = "cpu-ipmi-redfish.rules" - hostPowerSeries = "ceems_ipmi_dcmi_current_watts" + hostPowerSeries = ipmiPowerMetric case slices.Contains(jobSeries[job], "ceems_rapl_package_joules_total"): tmplFile = "cpu-rapl.rules" hostPowerSeries = "ceems_rapl_package_joules_total" @@ -264,8 +270,8 @@ func CreatePromRecordingRules( var hostPowerLabel string - if hostPowerSeries == "ceems_redfish_current_watts" { - matcher := fmt.Sprintf(`ceems_redfish_current_watts{job="%s"}`, job) + if hostPowerSeries == redfishPowerMetric { + matcher := fmt.Sprintf(`%s{job="%s"}`, redfishPowerMetric, job) chassis, _, err := api.LabelValues(ctx, "chassis", []string{matcher}, stime, etime) // Ignoring warnings for now. 
if err != nil { @@ -277,7 +283,7 @@ func CreatePromRecordingRules( // If there are more than 1 chassis, emit log for operators to tell them to // choose appropriate chassis to get CPU power usage if len(chassis) > 1 { - fmt.Fprintln(os.Stderr, "Multiple chassis found for ceems_redfish_current_watts for job", job) + fmt.Fprintln(os.Stderr, "Multiple chassis found for", redfishPowerMetric, "for job", job) fmt.Fprintln(os.Stderr, "Choose the chassis that reports host power usage") for ichas, chas := range chassis { @@ -315,9 +321,9 @@ func CreatePromRecordingRules( } else if len(chassis) == 1 { targetChassis = chassis[0] } else { - fmt.Fprintln(os.Stderr, "no chassis found for ceems_redfish_current_watts for job", job) + fmt.Fprintln(os.Stderr, "no chassis found for", redfishPowerMetric, "for job", job) - return errors.New("no chassis found for ceems_redfish_current_watts") + return fmt.Errorf("no chassis found for %s", redfishPowerMetric) } // If targetChassis is found, set up label diff --git a/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt b/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt index d571e421..98b3af1b 100644 --- a/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt +++ b/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt @@ -325,8 +325,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-ipmi-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -359,7 +359,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( ( @@ -371,7 +371,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} @@ -440,7 +440,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -490,8 +490,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-only-ipmi"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * ceems_ipmi_dcmi_current_watts{job="cpu-only-ipmi"} + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -524,7 +524,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) @@ -546,7 +546,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) @@ -568,7 +568,7 @@ groups: ) ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} @@ -637,7 +637,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_ipmi_dcmi_current_watts{job="cpu-only-ipmi"}) + sum by (job) (1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -849,8 +849,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-only-redfish"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_redfish_current_watts:pue - expr: 1 * ceems_redfish_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} + - record: instance:ceems_redfish_power_current_watts:pue + expr: 1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -883,7 +883,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-redfish"}[2s])) @@ -905,7 +905,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-redfish",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-redfish"}[2s])) @@ -927,7 +927,7 @@ groups: ) ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-only-redfish"} @@ -996,7 +996,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) + sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -1046,8 +1046,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-redfish-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_redfish_current_watts:pue - expr: 1 * ceems_redfish_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"} + - record: instance:ceems_redfish_power_current_watts:pue + expr: 1 * ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -1080,7 +1080,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-redfish-nvidia-gpu"}[2s])) @@ -1102,7 +1102,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-redfish-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-redfish-nvidia-gpu"}[2s])) @@ -1124,7 +1124,7 @@ groups: ) ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-redfish-nvidia-gpu"} @@ -1193,7 +1193,7 @@ groups: # Total power (Watts) consumed by 
all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"}) + sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -1860,8 +1860,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-ipmi-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -1894,7 +1894,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( ( @@ -1906,7 +1906,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} @@ -1973,7 +1973,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -2030,8 +2030,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-only-ipmi"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * ceems_ipmi_dcmi_current_watts{job="cpu-only-ipmi"} + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -2064,7 +2064,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) @@ -2086,7 +2086,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) @@ -2108,7 +2108,7 @@ groups: ) ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} @@ -2175,7 +2175,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_ipmi_dcmi_current_watts{job="cpu-only-ipmi"}) + sum by (job) (1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -2399,8 +2399,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-only-redfish"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_redfish_current_watts:pue - expr: 1 * ceems_redfish_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} + - record: instance:ceems_redfish_power_current_watts:pue + expr: 1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -2433,7 +2433,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-redfish"}[2s])) @@ -2455,7 +2455,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-redfish",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-redfish"}[2s])) @@ -2477,7 +2477,7 @@ groups: ) ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-only-redfish"} @@ -2544,7 +2544,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) + sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -2601,8 +2601,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-redfish-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_redfish_current_watts:pue - expr: 1 * (label_replace(ceems_redfish_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + - record: instance:ceems_redfish_power_current_watts:pue + expr: 1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -2635,7 +2635,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-redfish-nvidia-gpu"}[2s])) @@ -2657,7 +2657,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-redfish-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-redfish-nvidia-gpu"}[2s])) @@ -2679,7 +2679,7 @@ groups: ) ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-redfish-nvidia-gpu"} @@ -2746,7 +2746,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_redfish_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -3418,8 +3418,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-ipmi-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 # Total host power (Watts) consumed by the compute unit accounting PUE value. 
# @@ -3452,7 +3452,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( ( @@ -3464,7 +3464,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} @@ -3531,7 +3531,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -3588,8 +3588,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-only-ipmi"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * ceems_ipmi_dcmi_current_watts{job="cpu-only-ipmi"} + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -3622,7 +3622,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) @@ -3644,7 +3644,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) @@ -3666,7 +3666,7 @@ groups: ) ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} @@ -3733,7 +3733,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_ipmi_dcmi_current_watts{job="cpu-only-ipmi"}) + sum by (job) (1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -3957,8 +3957,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-only-redfish"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_redfish_current_watts:pue - expr: 1 * ceems_redfish_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} + - record: instance:ceems_redfish_power_current_watts:pue + expr: 1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -3991,7 +3991,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-redfish"}[2s])) @@ -4013,7 +4013,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-redfish",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-redfish"}[2s])) @@ -4035,7 +4035,7 @@ groups: ) ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job="cpu-only-redfish"} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue{job="cpu-only-redfish"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-only-redfish"} @@ -4102,7 +4102,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) + sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -4159,8 +4159,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="cpu-redfish-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_redfish_current_watts:pue - expr: 1 * (label_replace(ceems_redfish_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + - record: instance:ceems_redfish_power_current_watts:pue + expr: 1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -4193,7 +4193,7 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-redfish-nvidia-gpu"}[2s])) @@ -4215,7 +4215,7 @@ groups: sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-redfish-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} + 0.9 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-redfish-nvidia-gpu"}[2s])) @@ -4237,7 +4237,7 @@ groups: ) ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue{job="cpu-redfish-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( ceems_compute_unit_memory_used_bytes{job="cpu-redfish-nvidia-gpu"} @@ -4304,7 +4304,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_redfish_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. diff --git a/pkg/collector/ipmi.go b/pkg/collector/ipmi.go index 38b40602..0c16072a 100644 --- a/pkg/collector/ipmi.go +++ b/pkg/collector/ipmi.go @@ -7,6 +7,7 @@ package collector // DCMI spec (old) https://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/dcmi-v1-5-rev-spec.pdf import ( + "cmp" "context" "encoding/json" "errors" @@ -15,6 +16,7 @@ import ( "os/exec" "path/filepath" "regexp" + "slices" "strconv" "strings" "time" @@ -25,11 +27,14 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -const ipmiCollectorSubsystem = "ipmi_dcmi" +const ( + ipmiCollectorSubsystem = "ipmi" + ipmiDcmiLabel = "ipmi_dcmi_power" +) // Custom errors. var ( - ErrIPMIUnavailable = errors.New("IPMI Power readings not Active") + ErrIPMIUnavailable = errors.New("ipmi dcmi power readings are not active") ) // Execution modes. 
@@ -42,14 +47,16 @@ const ( ) type impiCollector struct { - logger *slog.Logger - hostname string - execMode string - ipmiCmd []string - client *ipmi.IPMIClient - securityContexts map[string]*security.SecurityContext - cachedMetric map[string]float64 - metricDesc map[string]*prometheus.Desc + logger *slog.Logger + hostname string + execMode string + ipmiCmd []string + client ipmi.Client + sensorRecords []*ipmi.FullSensorRecord + securityContexts map[string]*security.SecurityContext + cachedDCMIReadings map[string]float64 + cachedSensorReadings map[*ipmi.FullSensorRecord]float64 + metricDesc map[string]*prometheus.Desc } /* @@ -134,29 +141,35 @@ type impiCollector struct { */ var ( - ipmiDcmiCmdDepr = CEEMSExporterApp.Flag( - "collector.ipmi.dcmi.cmd", - "IPMI DCMI command to get system power statistics. Use full path to executables.", - ).Hidden().Default("").String() ipmiDcmiCmd = CEEMSExporterApp.Flag( - "collector.ipmi_dcmi.cmd", + "collector.ipmi.dcmi.cmd", "IPMI DCMI command to get system power statistics. Use full path to executables.", ).Default("").String() + ipmiPwrEnergySensors = CEEMSExporterApp.Flag( + "collector.ipmi.power-energy-sensor-readings", + "Enables collection of IPMI energy and/or power sensor readings. Sensors will be detected based on units. (default: disabled).", + ).Default("false").Bool() + ipmiMiscSensors = CEEMSExporterApp.Flag( + "collector.ipmi.sensor-id", + "Sensor IDs to monitor and export metrics.", + ).Uint8List() ipmiDevNum = CEEMSExporterApp.Flag( - "collector.ipmi_dcmi.dev-num", + "collector.ipmi.dev-num", "Device number used by OpenIPMI driver. For e.g. if device is found at /dev/ipmi0, device number is 0", ).Default("0").Int() forceNativeMode = CEEMSExporterApp.Flag( - "collector.ipmi_dcmi.force-native-mode", + "collector.ipmi.force-native-mode", "Force native mode using OpenIPMI driver.", ).Default("false").Bool() // test flags. Hidden. 
ipmiDcmiTestMode = CEEMSExporterApp.Flag( - "collector.ipmi_dcmi.test-mode", + "collector.ipmi.test-mode", "Enables IPMI DCMI collector in test mode. Only used in unit and e2e tests.", ).Default("false").Hidden().Bool() +) +var ( ipmiDcmiCmds = []string{ "ipmi-dcmi --get-system-power-statistics", "ipmitool dcmi power reading", @@ -194,9 +207,15 @@ const ( openIPMICtx = "open_ipmi" ) +type ipmiReadings struct { + dcmiPower map[string]float64 + sensors map[*ipmi.FullSensorRecord]float64 +} + type ipmiClientSecurityCtxData struct { - client *ipmi.IPMIClient - powerReadings map[string]float64 + client ipmi.Client + sensorRecords []*ipmi.FullSensorRecord + readings *ipmiReadings } func init() { @@ -205,32 +224,35 @@ func init() { // NewIPMICollector returns a new Collector exposing IMPI DCMI power metrics. func NewIPMICollector(logger *slog.Logger) (Collector, error) { - if *ipmiDcmiCmdDepr != "" { - logger.Warn("flag --collector.ipmi.dcmi.cmd has been deprecated. Use native mode by OpenIPMI driver using --collector.ipmi_dcmi.force-native-mode") + // Check if native mode is enabled when sensors are provided + if (*ipmiPwrEnergySensors || len(*ipmiMiscSensors) > 0) && !*forceNativeMode { + return nil, errors.New("fetching ipmi sensor readings is only supported when --collector.ipmi.force-native-mode is enabled") } var execMode string // Initialize metricDesc map - metricDesc := make(map[string]*prometheus.Desc, 4) - - cachedMetric := make(map[string]float64, 4) + metricDesc := make(map[string]*prometheus.Desc, 5) - metricDesc["current"] = prometheus.NewDesc( - prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "current_watts"), - "Current Power consumption in watts", []string{"hostname"}, nil, + metricDesc["dcmi_current"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiDcmiLabel, "current_watts"), + "Current power consumption reported by DCMI in watts", []string{"hostname"}, nil, ) - metricDesc["min"] = prometheus.NewDesc( - 
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "min_watts"), - "Minimum Power consumption in watts", []string{"hostname"}, nil, + metricDesc["dcmi_min"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiDcmiLabel, "min_watts"), + "Minimum power consumption reported by DCMI in watts", []string{"hostname"}, nil, ) - metricDesc["max"] = prometheus.NewDesc( - prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "max_watts"), - "Maximum Power consumption in watts", []string{"hostname"}, nil, + metricDesc["dcmi_max"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiDcmiLabel, "max_watts"), + "Maximum power consumption reported by DCMI in watts", []string{"hostname"}, nil, ) - metricDesc["avg"] = prometheus.NewDesc( - prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "avg_watts"), - "Average Power consumption in watts", []string{"hostname"}, nil, + metricDesc["dcmi_avg"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiDcmiLabel, "avg_watts"), + "Average power consumption reported by DCMI in watts", []string{"hostname"}, nil, + ) + metricDesc["sensors"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, "ipmi_sensor_reading", "current"), + "Current reading of IPMI sensor", []string{"hostname", "sensorname", "sensorunits"}, nil, ) // If no IPMI command is provided, try to find one @@ -245,7 +267,7 @@ func NewIPMICollector(logger *slog.Logger) (Collector, error) { goto outside } - if *ipmiDcmiCmd == "" && *ipmiDcmiCmdDepr == "" { + if *ipmiDcmiCmd == "" { if cmdSlice, err = findIPMICmd(); err != nil { logger.Info("None of ipmitool,ipmiutil,ipmi-dcmi commands found. 
Using native implementation using OpenIPMI interface") @@ -254,11 +276,7 @@ func NewIPMICollector(logger *slog.Logger) (Collector, error) { goto outside } } else { - if *ipmiDcmiCmdDepr != "" { - cmdSlice = strings.Split(*ipmiDcmiCmdDepr, " ") - } else { - cmdSlice = strings.Split(*ipmiDcmiCmd, " ") - } + cmdSlice = strings.Split(*ipmiDcmiCmd, " ") } logger.Debug("Using IPMI command", "ipmi", strings.Join(cmdSlice, " ")) @@ -307,17 +325,25 @@ func NewIPMICollector(logger *slog.Logger) (Collector, error) { goto outside } + // By this point, if we still have not found an execMode, return an error + if execMode == "" { + logger.Error("Failed to execute IPMI commands. Ensure enough privileges are set on exporter process") + + return nil, errors.New("failed to execute ipmi commands due to lack of privileges") + } + outside: - logger.Debug("IPMI DCMI collector", "execution_mode", execMode) + logger.Debug("IPMI collector", "execution_mode", execMode) collector := impiCollector{ - logger: logger, - hostname: hostname, - execMode: execMode, - metricDesc: metricDesc, - cachedMetric: cachedMetric, - securityContexts: make(map[string]*security.SecurityContext), + logger: logger, + hostname: hostname, + execMode: execMode, + metricDesc: metricDesc, + cachedDCMIReadings: make(map[string]float64), + cachedSensorReadings: make(map[*ipmi.FullSensorRecord]float64), + securityContexts: make(map[string]*security.SecurityContext), } // Setup necessary capabilities. 
@@ -356,19 +382,68 @@ outside: logger.Warn("Failed to parse capability name(s)", "err", err) } + // IPMI config + ipmiConfig := &ipmi.Config{ + Logger: logger.With("subsystem", "ipmi_client"), + DevNum: *ipmiDevNum, + Timeout: time.Second, + } + // Setup IPMI client - collector.client, err = ipmi.NewIPMIClient(*ipmiDevNum, logger.With("subsystem", "ipmi_client")) + collector.client, err = ipmi.NewClient(ipmiConfig) if err != nil { logger.Error("Failed to create a IPMI client", "err", err) return nil, err } + if *ipmiPwrEnergySensors || len(*ipmiMiscSensors) > 0 { + // Get all sensor records + sensorRecords, err := collector.client.SensorRecords() + if err != nil { + logger.Error("Failed to get sensor records", "err", err) + + return nil, err + } + + // Filter records + for _, record := range sensorRecords { + // When power and energy sensors are requested, filter based on units + if *ipmiPwrEnergySensors { + if record.BaseUnit == ipmi.SensorUnitWatts || record.BaseUnit == ipmi.SensorUnitJoules { + collector.sensorRecords = append(collector.sensorRecords, record) + } + } + + // When sensor IDs are requested + if slices.Contains(*ipmiMiscSensors, record.Number) { + collector.sensorRecords = append(collector.sensorRecords, record) + } + } + + // Remove duplicates + slices.SortFunc(collector.sensorRecords, func(a, b *ipmi.FullSensorRecord) int { + return cmp.Compare(a.Number, b.Number) + }) + + collector.sensorRecords = slices.CompactFunc(collector.sensorRecords, func(a, b *ipmi.FullSensorRecord) bool { + return a.Number == b.Number + }) + + // Get sensor names + sensorNames := make([]string, len(collector.sensorRecords)) + for irecord, record := range collector.sensorRecords { + sensorNames[irecord] = record.Identity + } + + logger.Debug("Sensor to monitor", "sensors", strings.Join(sensorNames, ","), "num_sensors", len(collector.sensorRecords)) + } + // Setup security context cfg := &security.SCConfig{ Name: openIPMICtx, Caps: caps, - Func: dcmiPowerReading, + Func: 
doIPMIRequests, Logger: logger, ExecNatively: disableCapAwareness, } @@ -388,18 +463,27 @@ outside: // Update implements Collector and exposes IPMI DCMI power related metrics. func (c *impiCollector) Update(ch chan<- prometheus.Metric) error { - // Get power consumption from IPMI - powerReadings, err := c.update() + // Get IPMI readings + ipmiReadings, err := c.update() if err != nil { return ErrNoData } // Returned value 0 means Power Measurement is not avail - for rType, rValue := range powerReadings { + for rType, rValue := range ipmiReadings.dcmiPower { if rValue > 0 { ch <- prometheus.MustNewConstMetric(c.metricDesc[rType], prometheus.GaugeValue, float64(rValue), c.hostname) - c.cachedMetric[rType] = rValue + c.cachedDCMIReadings[rType] = rValue + } + } + + // Returned value 0 means sensor reading is not avail + for sType, sValue := range ipmiReadings.sensors { + if sValue > 0 { + ch <- prometheus.MustNewConstMetric(c.metricDesc["sensors"], prometheus.GaugeValue, float64(sValue), c.hostname, sType.Identity, sType.BaseUnit.String()) + + c.cachedSensorReadings[sType] = sValue } } @@ -423,40 +507,56 @@ func (c *impiCollector) Stop(_ context.Context) error { } // update returns current power readings or cached ones. -func (c *impiCollector) update() (map[string]float64, error) { - // Get power consumption from IPMI +func (c *impiCollector) update() (*ipmiReadings, error) { + // Get power consumption from DCMI and sensor readings // IPMI commands tend to fail frequently. If that happens we use last cached metric - powerReadings, err := c.getPowerReadings() + readings, err := c.getIPMIReadings() if err != nil { - // If there is no cached metric return - if len(c.cachedMetric) == 0 { - return nil, ErrNoData + // If there are no current and cached readings, return error + if readings == nil { + if len(c.cachedDCMIReadings) == 0 && len(c.cachedSensorReadings) == 0 { + return nil, ErrNoData + } + + c.logger.Error("Failed to get readings from IPMI. 
Using last cached values", "err", err) + + return &ipmiReadings{dcmiPower: c.cachedDCMIReadings, sensors: c.cachedSensorReadings}, nil + } + + // If DCMI readings are not available and cached readings are available set them + if len(readings.dcmiPower) == 0 && len(c.cachedDCMIReadings) > 0 { + c.logger.Error( + "Failed to get power statistics from IPMI DCMI. Using last cached value", + "err", err, "cached_value", c.cachedDCMIReadings["dcmi_current"], + ) + + readings.dcmiPower = c.cachedDCMIReadings } - c.logger.Error( - "Failed to get power statistics from IPMI. Using last cached values", - "err", err, "cached_metrics", fmt.Sprintf("%#v", c.cachedMetric), - ) + // If sensor readings are not available and cached readings are available set them + if len(c.sensorRecords) > 0 && len(readings.sensors) == 0 && len(c.cachedSensorReadings) > 0 { + c.logger.Error("Failed to get readings from IPMI sensors. Using last cached values", "err", err) - powerReadings = c.cachedMetric + readings.sensors = c.cachedSensorReadings + } } else { // Ensure powerReadings are non nil // Check only current usage which is more important - if currentUsage, ok := powerReadings["current"]; !ok || currentUsage == 0 { + if currentUsage, ok := readings.dcmiPower["dcmi_current"]; !ok || currentUsage == 0 { c.logger.Error( - "IPMI returned null values. Using last cached values", - "err", err, "cached_metrics", fmt.Sprintf("%#v", c.cachedMetric), + "IPMI DCMI returned null values. Using last cached value", + "err", err, "cached_value", c.cachedDCMIReadings["dcmi_current"], ) - powerReadings = c.cachedMetric + readings.dcmiPower = c.cachedDCMIReadings } } - return powerReadings, nil + return readings, nil } -// Get current, min and max power readings. -func (c *impiCollector) getPowerReadings() (map[string]float64, error) { +// Get current, min and max DCMI power and sensor readings. 
+func (c *impiCollector) getIPMIReadings() (*ipmiReadings, error) { // If mode is native, make request in security context if c.execMode == nativeMode { return c.doRequestInSecurityContext() @@ -480,7 +580,7 @@ func (c *impiCollector) getPowerReadings() (map[string]float64, error) { return nil, err } - return values, nil + return &ipmiReadings{dcmiPower: values}, nil } // Parse current, min and max power readings for capmc output. @@ -504,14 +604,14 @@ func (c *impiCollector) parseCapmcOutput(stdOut []byte) (map[string]float64, err for rType := range ipmiDCMIPowerReadingRegexMap { if value, ok := data[rType]; ok { if valueFloat, valueOk := value.(float64); valueOk { - powerReadings[rType] = valueFloat + powerReadings["dcmi_"+rType] = valueFloat } } } // capmc does not return current power. So we use avg as proxy for current - if powerReadings["avg"] > 0 { - powerReadings["current"] = powerReadings["avg"] + if powerReadings["dcmi_avg"] > 0 { + powerReadings["dcmi_current"] = powerReadings["dcmi_avg"] } return powerReadings, nil @@ -533,7 +633,7 @@ func (c *impiCollector) parseIPMIOutput(stdOut []byte) (map[string]float64, erro for rType, regex := range ipmiDCMIPowerReadingRegexMap { if reading, err := getValue(stdOut, regex); err == nil { if readingValue, err := strconv.ParseFloat(reading, 64); err == nil { - powerReadings[rType] = readingValue + powerReadings["dcmi_"+rType] = readingValue } } } @@ -595,25 +695,28 @@ func (c *impiCollector) executeCmdInSecurityContext() ([]byte, error) { } // doRequestInSecurityContext makes requests to IPMI device interface within a security context. 
-func (c *impiCollector) doRequestInSecurityContext() (map[string]float64, error) { +func (c *impiCollector) doRequestInSecurityContext() (*ipmiReadings, error) { // Execute command as root dataPtr := &ipmiClientSecurityCtxData{ - client: c.client, + client: c.client, + sensorRecords: c.sensorRecords, } // Read stdOut of command into data if securityCtx, ok := c.securityContexts[openIPMICtx]; ok { + // Always return readings as we might have partial result + // in readings if err := securityCtx.Exec(dataPtr); err != nil { - return nil, err + return dataPtr.readings, err } } else { return nil, security.ErrNoSecurityCtx } - return dataPtr.powerReadings, nil + return dataPtr.readings, nil } -func dcmiPowerReading(data any) error { +func doIPMIRequests(data any) error { // Assert data var d *ipmiClientSecurityCtxData @@ -622,20 +725,32 @@ func dcmiPowerReading(data any) error { return security.ErrSecurityCtxDataAssertion } + // Initialize readings + d.readings = &ipmiReadings{} + // Get current power reading from DCMI - reading, err := d.client.PowerReading(time.Second) + dcmiReading, err := d.client.DCMIPowerReading() if err != nil { return err } - // Read power reading into dataPointer - d.powerReadings = map[string]float64{ - "min": float64(reading.Minimum), - "max": float64(reading.Maximum), - "avg": float64(reading.Average), - "current": float64(reading.Current), + // Read power readings into dataPointer + d.readings.dcmiPower = map[string]float64{ + "dcmi_min": float64(dcmiReading.Minimum), + "dcmi_max": float64(dcmiReading.Maximum), + "dcmi_avg": float64(dcmiReading.Average), + "dcmi_current": float64(dcmiReading.Current), + } + + // Get sensor readings + sensorReadings, err := d.client.SensorReadings(d.sensorRecords) + if err != nil { + return err } + // Read sensor readings into dataPointer + d.readings.sensors = sensorReadings + return nil } diff --git a/pkg/collector/ipmi_test.go b/pkg/collector/ipmi_test.go index f331dba4..6d4c01bd 100644 --- 
a/pkg/collector/ipmi_test.go +++ b/pkg/collector/ipmi_test.go @@ -4,11 +4,14 @@ package collector import ( + "errors" "fmt" "os" "path/filepath" "testing" + "github.com/ceems-dev/ceems/internal/security" + "github.com/ceems-dev/ceems/pkg/ipmi" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -80,23 +83,82 @@ ipmiutil dcmi, completed successfully crayPowerCap: `{"e":1,"err_msg":"failed"}`, } expectedPower = map[string]float64{ - "current": 332, - "min": 68, - "max": 504, - "avg": 348, + "dcmi_current": 332, + "dcmi_min": 68, + "dcmi_max": 504, + "dcmi_avg": 348, } expectedCapmcPower = map[string]float64{ - "current": 348, - "min": 68, - "max": 504, - "avg": 348, + "dcmi_current": 348, + "dcmi_min": 68, + "dcmi_max": 504, + "dcmi_avg": 348, + } + testSensorRecords = []*ipmi.FullSensorRecord{ + {Identity: "Sensor 1"}, + {Identity: "Sensor 2"}, + } + expectedSensorReading = map[*ipmi.FullSensorRecord]float64{ + testSensorRecords[0]: 123, + testSensorRecords[1]: 223, } ) +type mockIPMIClient struct { + dcmiCounter, sensorCounter int +} + +func newMockIPMIClient() ipmi.Client { + return &mockIPMIClient{} +} + +func (c *mockIPMIClient) Close() error { + return nil +} + +func (c *mockIPMIClient) Do(r *ipmi.Request) (*ipmi.Response, error) { + return nil, nil //nolint:nilnil +} + +func (c *mockIPMIClient) DCMIPowerReading() (*ipmi.PowerReading, error) { + if c.dcmiCounter == 2 { + return nil, errors.New("some error") + } + + c.dcmiCounter++ + + return &ipmi.PowerReading{ + Minimum: expectedPower["dcmi_min"], + Maximum: expectedPower["dcmi_max"], + Current: expectedPower["dcmi_current"], + Average: expectedPower["dcmi_avg"], + }, nil +} + +func (c *mockIPMIClient) LanIP() (*string, error) { + ip := "10.0.0.1" + + return &ip, nil +} + +func (c *mockIPMIClient) SensorRecords() ([]*ipmi.FullSensorRecord, error) { + return testSensorRecords, nil +} + +func (c *mockIPMIClient) SensorReadings(records 
[]*ipmi.FullSensorRecord) (map[*ipmi.FullSensorRecord]float64, error) { + if c.sensorCounter == 1 { + return nil, errors.New("some error") + } + + c.sensorCounter++ + + return expectedSensorReading, nil +} + func TestIPMICollector(t *testing.T) { _, err := CEEMSExporterApp.Parse([]string{ - "--collector.ipmi_dcmi.cmd", "testdata/ipmi/capmc/capmc", - "--collector.ipmi_dcmi.test-mode", + "--collector.ipmi.dcmi.cmd", "testdata/ipmi/capmc/capmc", + "--collector.ipmi.test-mode", }) require.NoError(t, err) @@ -138,7 +200,7 @@ func TestIpmiMetrics(t *testing.T) { } require.NoError(t, err) - assert.Equal(t, expectedOutput, value) + assert.Equal(t, expectedOutput, value, testName) } } @@ -153,7 +215,7 @@ func TestIpmiMetricsDisactive(t *testing.T) { value, _ = c.parseIPMIOutput([]byte(testString)) } - assert.Empty(t, value) + assert.Empty(t, value, testName) } } @@ -187,8 +249,8 @@ func TestIpmiClientFinder(t *testing.T) { t.Setenv("PATH", fmt.Sprintf("%s:%s", ipmiClientPath, basePath)) ipmiClientSlice, err := findIPMICmd() - require.NoError(t, err) - assert.Equal(t, test.name, ipmiClientSlice[0]) + require.NoError(t, err, test.name) + assert.Equal(t, test.name, ipmiClientSlice[0], test.name) } } @@ -199,7 +261,43 @@ func TestCachedPowerReadings(t *testing.T) { // Set path t.Setenv("PATH", fmt.Sprintf("%s:%s", tmpDir, os.Getenv("PATH"))) + // Expected values + expected := map[string]float64{"dcmi_avg": 49, "dcmi_current": 304, "dcmi_max": 304, "dcmi_min": 6} + + // When collector is being instantiated d1 := []byte(`#!/bin/bash +exit 1`) + err := os.WriteFile(tmpIPMIPath, d1, 0o700) //nolint:gosec + require.NoError(t, err) + + _, err = CEEMSExporterApp.Parse([]string{ + "--collector.ipmi.dcmi.cmd", tmpIPMIPath, + "--collector.ipmi.test-mode", + }) + require.NoError(t, err) + + collector, err := NewIPMICollector(noOpLogger) + require.NoError(t, err) + + c := collector.(*impiCollector) //nolint:forcetypeassert + + // Setup background goroutine to capture metrics. 
+ metrics := make(chan prometheus.Metric) + defer close(metrics) + + go func() { + i := 0 + for range metrics { + i++ + } + }() + + // Get readings + err = collector.Update(metrics) + require.Error(t, err, "first scrape should result in error") + + // Now command should pass + d1 = []byte(`#!/bin/bash echo """ipmiutil dcmi ver 3.17 -- BMC version 6.10, IPMI version 2.0 @@ -221,39 +319,26 @@ DCMI Secondary LAN Channel: Supported Correction Time: 62914560 ms Sampling period: 1472 sec ipmiutil dcmi, completed successfully"""`) - err := os.WriteFile(tmpIPMIPath, d1, 0o700) //nolint:gosec - require.NoError(t, err) - - // Expected values - expected := map[string]float64{"avg": 49, "current": 304, "max": 304, "min": 6} - - _, err = CEEMSExporterApp.Parse([]string{ - "--collector.ipmi_dcmi.cmd", tmpIPMIPath, - "--collector.ipmi_dcmi.test-mode", - }) + err = os.WriteFile(tmpIPMIPath, d1, 0o700) //nolint:gosec require.NoError(t, err) - collector, err := NewIPMICollector(noOpLogger) + // Get readings + err = collector.Update(metrics) require.NoError(t, err) - c := collector.(*impiCollector) //nolint:forcetypeassert + assert.Equal(t, expected, c.cachedDCMIReadings) - // Setup background goroutine to capture metrics. 
- metrics := make(chan prometheus.Metric) - defer close(metrics) - - go func() { - i := 0 - for range metrics { - i++ - } - }() + // Modify script again to return error + d1 = []byte(`#!/bin/bash +exit 1`) + err = os.WriteFile(tmpIPMIPath, d1, 0o700) //nolint:gosec + require.NoError(t, err) // Get readings - err = collector.Update(metrics) + got, err := c.update() require.NoError(t, err) - assert.Equal(t, expected, c.cachedMetric) + assert.Equal(t, expected, got.dcmiPower) // Modify IPMI command to give 0 current usage d1 = []byte(`#!/bin/bash @@ -282,8 +367,72 @@ ipmiutil dcmi, completed successfully"""`) require.NoError(t, err) // Get readings again and we should get last cached values + got, err = c.update() + require.NoError(t, err) + + assert.Equal(t, expected, got.dcmiPower) +} + +func TestIpmiNativeMode(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--collector.ipmi.dcmi.cmd", "testdata/ipmi/capmc/capmc", + "--collector.ipmi.test-mode", + }) + require.NoError(t, err) + + collector, err := NewIPMICollector(noOpLogger) + require.NoError(t, err) + + c := collector.(*impiCollector) //nolint:forcetypeassert + + // Set native mode + c.execMode = nativeMode + c.client = newMockIPMIClient() + c.sensorRecords = testSensorRecords + + // Setup security context + cfg := &security.SCConfig{ + Name: openIPMICtx, + Logger: noOpLogger, + Func: doIPMIRequests, + ExecNatively: true, + } + secuCtx, err := security.NewSecurityContext(cfg) + require.NoError(t, err) + + c.securityContexts[openIPMICtx] = secuCtx + + // Setup background goroutine to capture metrics. 
+ metrics := make(chan prometheus.Metric) + defer close(metrics) + + go func() { + i := 0 + for range metrics { + i++ + } + }() + + // Make first scrape and should get expected values + err = c.Update(metrics) + require.NoError(t, err) + + assert.Equal(t, expectedPower, c.cachedDCMIReadings) + assert.Equal(t, expectedSensorReading, c.cachedSensorReadings) + + // Make second scrape where sensors should fail but should get from + // cached got, err := c.update() require.NoError(t, err) - assert.Equal(t, expected, got) + assert.Equal(t, expectedSensorReading, got.sensors) + assert.Equal(t, expectedSensorReading, c.cachedSensorReadings) + + // Make third scrape where DCMI should fail but should get from + // cached + got, err = c.update() + require.NoError(t, err) + + assert.Equal(t, expectedPower, got.dcmiPower) + assert.Equal(t, expectedPower, c.cachedDCMIReadings) } diff --git a/pkg/collector/redfish.go b/pkg/collector/redfish.go index f6a88235..46f31b1f 100644 --- a/pkg/collector/redfish.go +++ b/pkg/collector/redfish.go @@ -82,14 +82,14 @@ func (c *redfishClientConfig) UnmarshalYAML(unmarshal func(any) error) error { // If BMC Hostname is not provided, attempt to discover it using OpenIPMI interface if c.Hostname == "" { // Make a new IPMI client - client, err := ipmi.NewIPMIClient(0, slog.New(slog.DiscardHandler)) + client, err := ipmi.NewClient(&ipmi.Config{DevNum: 0, Logger: slog.New(slog.DiscardHandler), Timeout: time.Second}) if err != nil { return fmt.Errorf("failed to create IPMI client to get BMC address: %w", err) } defer client.Close() // Attempt to get new IP address - bmcIP, err := client.LanIP(time.Second) + bmcIP, err := client.LanIP() if err != nil { return fmt.Errorf("failed to get BMC LAN IP: %w", err) } @@ -181,19 +181,19 @@ func NewRedfishCollector(logger *slog.Logger) (Collector, error) { // Initialize metricDesc map metricDesc := map[string]*prometheus.Desc{ "current": prometheus.NewDesc( - prometheus.BuildFQName(Namespace, 
redfishCollectorSubsystem, "current_watts"), + prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "power_current_watts"), "Current Power consumption in watts", []string{"hostname", "chassis"}, nil, ), "min": prometheus.NewDesc( - prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "min_watts"), + prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "power_min_watts"), "Minimum Power consumption in watts", []string{"hostname", "chassis"}, nil, ), "max": prometheus.NewDesc( - prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "max_watts"), + prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "power_max_watts"), "Maximum Power consumption in watts", []string{"hostname", "chassis"}, nil, ), "avg": prometheus.NewDesc( - prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "avg_watts"), + prometheus.BuildFQName(Namespace, redfishCollectorSubsystem, "power_avg_watts"), "Average Power consumption in watts", []string{"hostname", "chassis"}, nil, ), } diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-k8s-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-k8s-output.txt index 020f66b9..c8b8b179 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-k8s-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-k8s-output.txt @@ -177,18 +177,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. # TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -219,7 +219,7 @@ ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/colle # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
# TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="k8s"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt index d8eabbd3..46703e17 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt @@ -139,18 +139,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. # TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -181,7 +181,7 @@ ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/colle # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
# TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="libvirt"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt index 933f062a..82df994c 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt @@ -155,18 +155,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 348 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 332 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 504 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 68 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 348 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 332 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 504 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 68 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. # TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -197,7 +197,7 @@ ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/colle # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
# TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt index bccea5bc..5a6847fb 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt @@ -176,18 +176,18 @@ ceems_cray_pm_counters_power_watts{domain="node",hostname=""} 873 ceems_cray_pm_counters_temp_celsius{domain="cpu0",hostname=""} 48 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 348 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 332 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 504 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 68 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 348 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 332 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 504 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 68 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -213,29 +213,29 @@ ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testda # TYPE ceems_rapl_package_power_limit_watts_total counter ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP 
ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge +ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="cray_pm_counters"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="redfish"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt index c56bc49d..a8844f15 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt @@ -231,18 +231,18 @@ ceems_cray_pm_counters_power_watts{domain="node",hostname=""} 873 ceems_cray_pm_counters_temp_celsius{domain="cpu0",hostname=""} 48 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -268,29 +268,29 @@ ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testda # TYPE ceems_rapl_package_power_limit_watts_total counter ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP 
ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge +ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="cray_pm_counters"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="redfish"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt index e08df076..19888b2c 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt @@ -175,18 +175,18 @@ ceems_hwmon_power_max_watts{chip="socket",chip_name="socket",hostname="",sensor= # TYPE ceems_hwmon_power_min_watts gauge ceems_hwmon_power_min_watts{chip="socket",chip_name="socket",hostname="",sensor="power1"} 65 ceems_hwmon_power_min_watts{chip="socket",chip_name="socket",hostname="",sensor="power2"} 46 -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge 
-ceems_ipmi_dcmi_avg_watts{hostname=""} 2567 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 2578 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 4294 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 2118 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 2567 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 2578 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 4294 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 2118 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -212,29 +212,29 @@ ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testda # TYPE ceems_rapl_package_power_limit_watts_total counter ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP 
ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge +ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="hwmon"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="redfish"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-nogpu-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-nogpu-output.txt index ed531d9a..b4f8fba0 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-nogpu-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-nogpu-output.txt @@ -193,18 +193,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. # TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -235,7 +235,7 @@ ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/colle # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
# TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="k8s"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-output.txt index a4417a5e..04ee2dd3 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-k8s-output.txt @@ -219,18 +219,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -256,28 +256,28 @@ ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testda # TYPE ceems_rapl_package_power_limit_watts_total counter ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP 
ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge +ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="k8s"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-nonsystemd-layout-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-nonsystemd-layout-output.txt index 5592c0c0..feca8c13 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-nonsystemd-layout-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-nonsystemd-layout-output.txt @@ -130,18 +130,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -167,28 +167,28 @@ ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testda # TYPE ceems_rapl_package_power_limit_watts_total counter ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP 
ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge +ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="libvirt"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt index 67d486e0..c5fb8edc 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt @@ -139,18 +139,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -176,28 +176,28 @@ ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testda # TYPE ceems_rapl_package_power_limit_watts_total counter ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP 
ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge +ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="libvirt"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt index d11c4ef6..8a6af0f4 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt @@ -272,18 +272,18 @@ ceems_infiniband_vl15_dropped_total{device="hfi1_0",port="1"} 0 ceems_infiniband_vl15_dropped_total{device="mlx4_0",port="1"} 0 ceems_infiniband_vl15_dropped_total{device="mlx4_0",port="2"} 0 ceems_infiniband_vl15_dropped_total{device="mlx5_0",port="1"} 0 -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts 
gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 5942 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 6132 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 5748 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -315,7 +315,7 @@ ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/colle # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="infiniband"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt index 892f0edc..009980de 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt @@ -176,18 +176,18 @@ ceems_cray_pm_counters_power_watts{domain="node",hostname=""} 873 ceems_cray_pm_counters_temp_celsius{domain="cpu0",hostname=""} 48 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 49 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 49 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 304 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 6 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 304 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 6 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -269,7 +269,7 @@ ceems_rdma_state_id{device="mlx5_0",hostname="",manager="slurm",port="1"} 4 # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="cray_pm_counters"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt index 7271e5ce..7dad68b1 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt @@ -155,18 +155,18 @@ ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 49 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 49 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 304 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 6 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 304 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 6 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -290,28 +290,28 @@ ceems_rdma_state_id{device="hfi1_0",hostname="",manager="slurm",port="1"} 4 ceems_rdma_state_id{device="mlx4_0",hostname="",manager="slurm",port="1"} 4 ceems_rdma_state_id{device="mlx4_0",hostname="",manager="slurm",port="2"} 4 ceems_rdma_state_id{device="mlx5_0",hostname="",manager="slurm",port="1"} 4 -# HELP ceems_redfish_avg_watts Average Power consumption in watts -# TYPE ceems_redfish_avg_watts gauge -ceems_redfish_avg_watts{chassis="Chassis_1",hostname=""} 365 -ceems_redfish_avg_watts{chassis="Chassis_2",hostname=""} 1734 -# HELP ceems_redfish_current_watts Current Power consumption in watts -# TYPE ceems_redfish_current_watts gauge -ceems_redfish_current_watts{chassis="Chassis_1",hostname=""} 397 -ceems_redfish_current_watts{chassis="Chassis_2",hostname=""} 1696 -# HELP ceems_redfish_max_watts Maximum Power consumption in watts -# TYPE ceems_redfish_max_watts gauge -ceems_redfish_max_watts{chassis="Chassis_1",hostname=""} 609 -ceems_redfish_max_watts{chassis="Chassis_2",hostname=""} 2155 -# HELP ceems_redfish_min_watts Minimum Power consumption in watts -# TYPE ceems_redfish_min_watts gauge -ceems_redfish_min_watts{chassis="Chassis_1",hostname=""} 326 -ceems_redfish_min_watts{chassis="Chassis_2",hostname=""} 588 +# HELP ceems_redfish_power_avg_watts Average Power consumption in watts +# TYPE ceems_redfish_power_avg_watts gauge +ceems_redfish_power_avg_watts{chassis="Chassis_1",hostname=""} 365 +ceems_redfish_power_avg_watts{chassis="Chassis_2",hostname=""} 1734 +# HELP ceems_redfish_power_current_watts Current Power consumption in watts +# TYPE ceems_redfish_power_current_watts gauge +ceems_redfish_power_current_watts{chassis="Chassis_1",hostname=""} 397 +ceems_redfish_power_current_watts{chassis="Chassis_2",hostname=""} 1696 +# HELP ceems_redfish_power_max_watts Maximum Power consumption in watts +# TYPE ceems_redfish_power_max_watts gauge 
+ceems_redfish_power_max_watts{chassis="Chassis_1",hostname=""} 609 +ceems_redfish_power_max_watts{chassis="Chassis_2",hostname=""} 2155 +# HELP ceems_redfish_power_min_watts Minimum Power consumption in watts +# TYPE ceems_redfish_power_min_watts gauge +ceems_redfish_power_min_watts{chassis="Chassis_1",hostname=""} 326 +ceems_redfish_power_min_watts{chassis="Chassis_2",hostname=""} 588 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="netdev"} 1 ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt index 18ed2c47..979a176c 100644 --- a/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt +++ b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt @@ -176,18 +176,18 @@ ceems_cray_pm_counters_power_watts{domain="node",hostname=""} 873 ceems_cray_pm_counters_temp_celsius{domain="cpu0",hostname=""} 48 # HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
# TYPE ceems_exporter_build_info gauge -# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts -# TYPE ceems_ipmi_dcmi_avg_watts gauge -ceems_ipmi_dcmi_avg_watts{hostname=""} 49 -# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts -# TYPE ceems_ipmi_dcmi_current_watts gauge -ceems_ipmi_dcmi_current_watts{hostname=""} 49 -# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts -# TYPE ceems_ipmi_dcmi_max_watts gauge -ceems_ipmi_dcmi_max_watts{hostname=""} 304 -# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts -# TYPE ceems_ipmi_dcmi_min_watts gauge -ceems_ipmi_dcmi_min_watts{hostname=""} 6 +# HELP ceems_ipmi_dcmi_power_avg_watts Average power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_avg_watts gauge +ceems_ipmi_dcmi_power_avg_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_power_current_watts Current power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_current_watts gauge +ceems_ipmi_dcmi_power_current_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_power_max_watts Maximum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_max_watts gauge +ceems_ipmi_dcmi_power_max_watts{hostname=""} 304 +# HELP ceems_ipmi_dcmi_power_min_watts Minimum power consumption reported by DCMI in watts +# TYPE ceems_ipmi_dcmi_power_min_watts gauge +ceems_ipmi_dcmi_power_min_watts{hostname=""} 6 # HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. 
# TYPE ceems_meminfo_MemAvailable_bytes gauge ceems_meminfo_MemAvailable_bytes{hostname=""} 0 @@ -219,7 +219,7 @@ ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/colle # TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="cray_pm_counters"} 1 -ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="ipmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 diff --git a/pkg/ipmi/client.go b/pkg/ipmi/client.go index 3e4eed38..7adbb788 100644 --- a/pkg/ipmi/client.go +++ b/pkg/ipmi/client.go @@ -3,6 +3,8 @@ package ipmi import ( + "bytes" + "encoding/binary" "errors" "fmt" "log/slog" @@ -24,19 +26,35 @@ const ( IPMI_BMC_CHANNEL = 0xF //nolint:stylecheck ) +type Client interface { + Do(r *Request) (*Response, error) + Close() error + DCMIPowerReading() (*PowerReading, error) + LanIP() (*string, error) + SensorRecords() ([]*FullSensorRecord, error) + SensorReadings(records []*FullSensorRecord) (map[*FullSensorRecord]float64, error) +} + +type Config struct { + Logger *slog.Logger + DevNum int + Timeout time.Duration +} + type timeout struct { value time.Duration } -type IPMIClient struct { - Logger *slog.Logger - DevFile *os.File - BMCAddr ipmiSystemInterfaceAddr +type ipmiClient struct { + logger *slog.Logger + devFile *os.File + bmcAddr ipmiSystemInterfaceAddr + timeout time.Duration } -// NewIPMIClient returns a new instance of IPMIClient struct. -func NewIPMIClient(devNum int, logger *slog.Logger) (*IPMIClient, error) { - if devNum < 0 { +// NewClient returns a new instance of Client struct. 
+func NewClient(c *Config) (Client, error) { + if c.DevNum < 0 { return nil, errors.New("device number for IPMI must be greater than zero") } @@ -47,8 +65,8 @@ func NewIPMIClient(devNum int, logger *slog.Logger) (*IPMIClient, error) { var devFile *os.File for _, d := range ipmiDevs { - if f, err := os.Open(fmt.Sprintf(d, devNum)); err == nil { - logger.Debug("IPMI device found", "device", fmt.Sprintf(d, devNum)) + if f, err := os.Open(fmt.Sprintf(d, c.DevNum)); err == nil { + c.Logger.Debug("IPMI device found", "device", fmt.Sprintf(d, c.DevNum)) devFile = f @@ -67,25 +85,34 @@ func NewIPMIClient(devNum int, logger *slog.Logger) (*IPMIClient, error) { return nil, fmt.Errorf("failed to enable IPMI event receiver: %w", errno) } - return &IPMIClient{ - Logger: logger, - DevFile: devFile, - BMCAddr: ipmiSystemInterfaceAddr{ + // Set a valid timeout + if c.Timeout == 0 { + c.Timeout = time.Second + } + + // Instantitate client + client := &ipmiClient{ + logger: c.Logger, + devFile: devFile, + bmcAddr: ipmiSystemInterfaceAddr{ AddrType: IPMI_SYSTEM_INTERFACE_ADDR_TYPE, Channel: IPMI_BMC_CHANNEL, Lun: 0x0, }, - }, nil + timeout: c.Timeout, + } + + return client, nil } // Do sends IPMI request and returns the response. 
-func (i *IPMIClient) Do(req *ipmiReq, t time.Duration) (*ipmiResp, error) { +func (i *ipmiClient) Do(req *Request) (*Response, error) { // Device file descriptor - fd := i.DevFile.Fd() + fd := i.devFile.Fd() // Send request if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, fd, IPMICTL_SEND_COMMAND, uintptr(unsafe.Pointer(req))); errno != 0 { - i.Logger.Error("Failed to send IPMI request", "err", errno) + i.logger.Error("Failed to send IPMI request", "err", errno) return nil, fmt.Errorf("failed to send IPMI request: %w", errno) } @@ -102,7 +129,7 @@ func (i *IPMIClient) Do(req *ipmiReq, t time.Duration) (*ipmiResp, error) { FDZero(&activeFdSet) FDSet(fd, &activeFdSet) - resp := ipmiResp{} + resp := Response{} addr := ipmiAddr{} recv := ipmiRecv{ Addr: uintptr(unsafe.Pointer(&addr)), @@ -114,44 +141,49 @@ func (i *IPMIClient) Do(req *ipmiReq, t time.Duration) (*ipmiResp, error) { } // Set timeout for select - timeout := timeout{t} + timeout := timeout{i.timeout} _, err := unix.Select(serverFD+1, &activeFdSet, nil, nil, timeout.timeval()) if err != nil { - i.Logger.Error("Failed to receive response from IPMI device interface", "err", err) + i.logger.Error("Failed to receive response from IPMI device interface", "err", err) return nil, fmt.Errorf("failed to receive response from IPMI device interface: %w", err) } // Check if fd is ready to read if !FDIsSet(fd, &activeFdSet) { - i.Logger.Error("No response received from IPMI device interface") + i.logger.Error("No response received from IPMI device interface") return nil, errors.New("no response received from IPMI device interface") } // Read data into recv struct if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, fd, IPMICTL_RECEIVE_MSG_TRUNC, uintptr(unsafe.Pointer(&recv))); errno != 0 { - i.Logger.Error("Failed to read response from IPMI device interface", "err", errno) + i.logger.Error("Failed to read response from IPMI device interface", "err", errno) return nil, fmt.Errorf("failed to read response from IPMI 
device interface: %w", errno) } // If Msgids match between response and request break if req.Msgid != recv.Msgid { - i.Logger.Error("Received response with unexpected ID", "req_id", req.Msgid, "resp_id", recv.Msgid) + i.logger.Error("Received response with unexpected ID", "req_id", req.Msgid, "resp_id", recv.Msgid) return nil, fmt.Errorf("received response with unexpected id: %d", recv.Msgid) } // Read response data resp.DataLen = int32(recv.Msg.DataLen) - i.Logger.Debug("IPMI response data", "data", resp.Data[0:resp.DataLen]) + // i.logger.Debug("IPMI response data", "data", resp.Data[0:resp.DataLen]) + + // Check completion code + if err := binary.Read(bytes.NewReader(resp.Data[0:1]), binary.BigEndian, &resp.Ccode); err == nil && resp.Ccode != 0 { + return nil, errors.New("received non zero completion code in IPMI response") + } return &resp, nil } // Close IPMI device file. -func (i *IPMIClient) Close() error { - return i.DevFile.Close() +func (i *ipmiClient) Close() error { + return i.devFile.Close() } diff --git a/pkg/ipmi/dcmi.go b/pkg/ipmi/dcmi.go index 2206aa0b..b32cd4f6 100644 --- a/pkg/ipmi/dcmi.go +++ b/pkg/ipmi/dcmi.go @@ -1,11 +1,8 @@ package ipmi import ( - "bytes" "encoding/binary" - "errors" "fmt" - "time" "unsafe" ) @@ -18,19 +15,19 @@ const ( ) type PowerReading struct { - Minimum, Maximum, Average, Current uint16 + Minimum, Maximum, Average, Current float64 Activated bool } // PowerReading returns the current IPMI DCMI power reading. 
-func (i *IPMIClient) PowerReading(timeout time.Duration) (*PowerReading, error) { +func (i *ipmiClient) DCMIPowerReading() (*PowerReading, error) { // Request payload msgData := [4]uint8{IPMI_DCMI, 0x1, 0x0, 0x0} // IPMI Request - req := ipmiReq{ - Addr: uintptr(unsafe.Pointer(&i.BMCAddr)), - AddrLen: uint(unsafe.Sizeof(i.BMCAddr)), + req := Request{ + Addr: uintptr(unsafe.Pointer(&i.bmcAddr)), + AddrLen: uint(unsafe.Sizeof(i.bmcAddr)), Msgid: 1, Msg: ipmiMsg{ Data: uintptr(unsafe.Pointer(&msgData[0])), @@ -41,25 +38,19 @@ func (i *IPMIClient) PowerReading(timeout time.Duration) (*PowerReading, error) } // Do request and read response - resp, err := i.Do(&req, timeout) + resp, err := i.Do(&req) if err != nil { - i.Logger.Error("Failed to make IPMI request", "err", err) + i.logger.Error("Failed to make IPMI request to get DCMI reading", "err", err) - return nil, fmt.Errorf("failed to make IPMI request: %w", err) - } - - // Check completion code - var completionCode uint16 - if err := binary.Read(bytes.NewReader(resp.Data[0:1]), binary.BigEndian, &completionCode); err == nil && completionCode != 0 { - return nil, errors.New("received non zero completion code for IPMI power readings response") + return nil, fmt.Errorf("failed to make ipmi request for dcmi reading: %w", err) } // Get readings return &PowerReading{ - Current: binary.LittleEndian.Uint16(resp.Data[2:4]), - Minimum: binary.LittleEndian.Uint16(resp.Data[4:6]), - Maximum: binary.LittleEndian.Uint16(resp.Data[6:8]), - Average: binary.LittleEndian.Uint16(resp.Data[8:10]), + Current: float64(binary.LittleEndian.Uint16(resp.Data[2:4])), + Minimum: float64(binary.LittleEndian.Uint16(resp.Data[4:6])), + Maximum: float64(binary.LittleEndian.Uint16(resp.Data[6:8])), + Average: float64(binary.LittleEndian.Uint16(resp.Data[8:10])), Activated: resp.Data[18] == IPMI_DCMI_ACTIVATED, }, nil } diff --git a/pkg/ipmi/lan.go b/pkg/ipmi/lan.go index 39483a1c..f03181e5 100644 --- a/pkg/ipmi/lan.go +++ b/pkg/ipmi/lan.go @@ 
-1,11 +1,7 @@ package ipmi import ( - "bytes" - "encoding/binary" - "errors" "fmt" - "time" "unsafe" ) @@ -17,14 +13,14 @@ const ( ) // LanIP returns the IP address of BMC. -func (i *IPMIClient) LanIP(timeout time.Duration) (*string, error) { +func (i *ipmiClient) LanIP() (*string, error) { // Request payload msgData := [4]uint8{IPMI_LAN, 0x3, 0x0, 0x0} // IPMI Request - req := ipmiReq{ - Addr: uintptr(unsafe.Pointer(&i.BMCAddr)), - AddrLen: uint(unsafe.Sizeof(i.BMCAddr)), + req := Request{ + Addr: uintptr(unsafe.Pointer(&i.bmcAddr)), + AddrLen: uint(unsafe.Sizeof(i.bmcAddr)), Msgid: 1, Msg: ipmiMsg{ Data: uintptr(unsafe.Pointer(&msgData[0])), @@ -35,17 +31,11 @@ func (i *IPMIClient) LanIP(timeout time.Duration) (*string, error) { } // Do request and read response - resp, err := i.Do(&req, timeout) + resp, err := i.Do(&req) if err != nil { - i.Logger.Error("Failed to make IPMI request", "err", err) + i.logger.Error("Failed to make IPMI request to get LAN IP", "err", err) - return nil, fmt.Errorf("failed to make IPMI request: %w", err) - } - - // Check completion code - var completionCode uint16 - if err := binary.Read(bytes.NewReader(resp.Data[0:1]), binary.BigEndian, &completionCode); err == nil && completionCode != 0 { - return nil, errors.New("received non zero completion code for IPMI LAN IP response") + return nil, fmt.Errorf("failed to make ipmi request to get lan ip: %w", err) } // Get LAN IP diff --git a/pkg/ipmi/sensors.go b/pkg/ipmi/sensors.go new file mode 100644 index 00000000..4b419635 --- /dev/null +++ b/pkg/ipmi/sensors.go @@ -0,0 +1,115 @@ +package ipmi + +import ( + "errors" + "fmt" + "unsafe" +) + +// IPMI sensor related constants. +const ( + IPMI_SENSOR_RECORD_CMD = 0x23 //nolint:stylecheck + IPMI_SENSOR_RECORD_NETFN = 0xa //nolint:stylecheck + IPMI_SENSOR_READING_CMD = 0x2d //nolint:stylecheck + IPMI_SENSOR_READING_NETFN = 0x4 //nolint:stylecheck +) + +// SensorRecords returns full sensor records info. 
+func (i *ipmiClient) SensorRecords() ([]*FullSensorRecord, error) { + // All errors + var errs error + + var recordID uint8 = 0 + + var fullSensorRecords []*FullSensorRecord + + for { + // Request payload + msgData := [6]uint8{0x0, 0x0, recordID, 0x0, 0x0, 0xff} + + // IPMI Request + req := Request{ + Addr: uintptr(unsafe.Pointer(&i.bmcAddr)), + AddrLen: uint(unsafe.Sizeof(i.bmcAddr)), + Msgid: 1, + Msg: ipmiMsg{ + Data: uintptr(unsafe.Pointer(&msgData[0])), + DataLen: 6, + Netfn: IPMI_SENSOR_RECORD_NETFN, + Cmd: IPMI_SENSOR_RECORD_CMD, + }, + } + + // Do request and read response + resp, err := i.Do(&req) + if err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to make ipmi request to get sensor record %d: %w", recordID, err)) + + continue + } + + sensorRecord := &FullSensorRecord{} + if err := sensorRecord.DecodeFromBytes(resp.Data[:]); err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to decode sensor record %d: %w", recordID, err)) + + continue + } + + i.logger.Debug( + "Full sensor record", "record_id", recordID, "sensor_number", sensorRecord.Number, + "description", sensorRecord.Identity, "units", sensorRecord.BaseUnit, + ) + + fullSensorRecords = append(fullSensorRecords, sensorRecord) + + // Next recordID + recordID = resp.Data[1] + + // If recordID reaches 255, we are at the end of the list + if recordID == 255 { + break + } + } + + return fullSensorRecords, errs +} + +// SensorReadings returns readings of sensors of given IDs. 
+func (i *ipmiClient) SensorReadings(sensorRecords []*FullSensorRecord) (map[*FullSensorRecord]float64, error) { + // Initialise sensor readings map + readings := make(map[*FullSensorRecord]float64, len(sensorRecords)) + + var errs error + + // Get reading for every sensor record + for _, record := range sensorRecords { + // Request payload + msgData := [1]uint8{record.Number} + + // IPMI Request + req := Request{ + Addr: uintptr(unsafe.Pointer(&i.bmcAddr)), + AddrLen: uint(unsafe.Sizeof(i.bmcAddr)), + Msgid: 1, + Msg: ipmiMsg{ + Data: uintptr(unsafe.Pointer(&msgData[0])), + DataLen: 1, + Netfn: IPMI_SENSOR_READING_NETFN, + Cmd: IPMI_SENSOR_READING_CMD, + }, + } + + // Do request and read response + resp, err := i.Do(&req) + if err != nil { + i.logger.Error("Failed to make IPMI request to get reading of sensor", "sensor", record.Identity, "err", err) + errs = errors.Join(errs, fmt.Errorf("failed to make ipmi request to get reading of sensor %s: %w", record.Identity, err)) + + continue + } + + readings[record] = record.ConvertReading(int16(resp.Data[1])) + } + + return readings, errs +} diff --git a/pkg/ipmi/types.go b/pkg/ipmi/types.go index f8711a86..7cb7a8bf 100644 --- a/pkg/ipmi/types.go +++ b/pkg/ipmi/types.go @@ -1,15 +1,19 @@ package ipmi -type Msg struct { - Netfn uint8 - Lun uint8 - Cmd uint8 - TargetCmd uint8 - DataLen uint16 - Data uintptr +import ( + "errors" + "fmt" + "math" +) + +type Request struct { + Addr uintptr + AddrLen uint + Msgid int + Msg ipmiMsg } -type ipmiResp struct { +type Response struct { Ccode uint8 Data [1024]uint8 DataLen int32 @@ -42,9 +46,752 @@ type ipmiSystemInterfaceAddr struct { Lun uint8 } -type ipmiReq struct { - Addr uintptr - AddrLen uint - Msgid int - Msg ipmiMsg +// Constants. +const ( + unknown = "Unknown" +) + +// All of the below structs and conversions are nicked from +// https://github.com/gebn/bmc project. They are modified to current +// library needs. 
+ +// ConversionFactors contains inputs to the linear formula in 30.3 and 36.3 of +// v1.5 and v2.0 respectively. This struct exists as conversion factors can come +// from two sources: full sensor records, and the Get Sensor Reading Factors +// command response. In practice, we get them from the former for linear and +// linearised sensors, as these have constant factors. We need to obtain them +// from the Get Sensor Reading Factors command for non-linear sensors, as they +// vary by reading here. Both FullSensorRecord and GetSensorReadingFactorsRsp +// embed this type. +// +// Note that we split application of the formula into "conversion" and +// "linearisation". Conversion happens first, and is the linear formula applied +// to the raw value. The linearisation step, which is a no-op for linear and +// non-linear sensors, applies one of the formulae in the specification to the +// result of the conversion. This struct only deals with conversion; see +// Lineariser for linearisation. +type ConversionFactors struct { + // M is the constant multiplier. This is a 10-bit 2's complement number on + // the wire. + M int16 + + // B is the additive offset. This is a 10-bit 2's complement number on the + // wire. + B int16 + + // BExp is the exponent, controlling the location of the decimal point in B. + // This is also referred to as K1 in the spec, and is a 4-bit 2's complement + // number on the wire. + BExp int8 + + // RExp is the result exponent, controlling the location of the decimal + // point in the result of the linear formula and hence input to the + // linearisation function. This is also referred to as K2 in the spec, and + // is a 4-bit 2's complement number on the wire. + RExp int8 +} + +// ConvertReading applies the linear formula to a raw sensor reading, without +// the linearisation formula. It is independent of unit. 
This method takes an +// int16 rather than uint8 as raw values can be in 1 or 2's complement, or +// unsigned, so it must accept from -128 (lowest 2's complement) to 255 (highest +// unsigned). The conversion from the raw format to a native int must be done +// before calling this method. +func (f *ConversionFactors) ConvertReading(raw int16) float64 { + mX := int64(f.M) * int64(raw) + b10k1 := float64(f.B) * math.Pow10(int(f.BExp)) + + return (float64(mX) + b10k1) * math.Pow10(int(f.RExp)) +} + +// FullSensorRecord is specified in 37.1 and 43.1 of v1.5 and v2.0 respectively. +// It describes any type of sensor, and is the only record type that can +// describe a sensor generating analogue (i.e. non-enumerated/discrete) +// readings, e.g. a temperature sensor. It is specified as 64 bytes. This layer +// represents the record key and record body sections. +type FullSensorRecord struct { + ConversionFactors + + // Sensor number that will be used in request to get reading + Number uint8 + + // BaseUnit gives the primary unit of the sensor's reading, e.g. Celsius or + // Fahrenheit for a temperature sensor. + BaseUnit SensorUnit + + // ModifierUnit is contained in the Sensor Units 3 field. Note this is + // distinct from the identically-named 2-bit field in Sensor Units 1. 0x0 + // means unused. + ModifierUnit SensorUnit + + // Linearisation indicates whether the sensor is linear, linearised or + // non-linear. This controls post-processing after applying the linear + // conversion formula to the raw reading. + Linearisation Linearisation + + // Tolerance gives the absolute accuracy of the sensor in +/- half raw + // counts. This is a 6-bit uint on the wire. + Tolerance uint8 + + // Accuracy gives the sensor accuracy in 0.01% increments when raised to + // AccuracyExp. This is a 10-bit int on the wire. + Accuracy int16 + + // AccuracyExp is the quantity Accuracy is raised to the power of to give + // the final accuracy. 
+ AccuracyExp uint8 + + // Identity is a descriptive string for the sensor. This can be up to 16 + // bytes long, which translates into 16-32 characters depending on the + // format used. There are no conventions around this, and it is provided for + // informational purposes only. Contrary to the name, attempting to identify + // sensors based on this value is doomed to fail. + Identity string +} + +func (r *FullSensorRecord) DecodeFromBytes(data []uint8) error { + if len(data) < 51 { + return fmt.Errorf("full sensor records are at least 51 bytes long, got %v", + len(data)) + } + + r.Number = data[10] + + r.BaseUnit = SensorUnit(data[24]) + r.ModifierUnit = SensorUnit(data[25]) + + r.Linearisation = Linearisation(data[26] & 0x7f) + + buf := [...]byte{data[28] >> 6, data[27]} + r.M = twos(buf, 10) + r.Tolerance = data[28] & 0x3f + buf[1] = data[29] + buf[0] = data[30] >> 6 + r.B = twos(buf, 10) + buf[1] = data[30]&0x3f | ((data[31] & 0xf0) << 2) + buf[0] = (data[31] & 0xf0) >> 6 + r.Accuracy = twos(buf, 10) + r.AccuracyExp = (data[31] & 0xc) >> 2 + buf[0] = 0 + buf[1] = data[32] >> 4 + r.RExp = int8(twos(buf, 4)) //nolint:gosec + buf[1] = data[32] & 0xf + r.BExp = int8(twos(buf, 4)) //nolint:gosec + + encoding := StringEncoding(data[50] >> 6) + + decoder, err := encoding.Decoder() + if err != nil { + // unsupported encoding; fail loudly so we can fix this + return err + } + + characters := int(data[50] & 0x1f) + + identity, _, err := decoder.Decode(data[51:], characters) + if err != nil { + // invalid bytes + return err + } + + r.Identity = identity + + return nil +} + +// bcdPlus defines the mappings of BCD plus nibbles to runes, specified in +// 37.15 and 43.15 of v1.5 and v2.0 respectively. An N byte string consists +// of 2N characters. 
+var bcdPlusRunes = [16]rune{ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + ' ', '-', '.', ':', ',', '_', +} + +// StringDecoder is implemented by things that know how to parse the final ID +// String field of full and compact SDRs. +type StringDecoder interface { + // Decode parses the first c characters (0 <= c <= 30) in b in the expected + // format (N.B. this could be a varying number of bytes depending on the + // encoding), returning the resulting string and number of bytes consumed, + // or an error if the data is too short or invalid. + // + // c was implemented as an int rather than uint8 to reduce the number of + // conversions required. + Decode(b []byte, c int) (string, int, error) +} + +// StringDecoderFunc eases implementation of stateless StringDecoders. +type StringDecoderFunc func([]byte, int) (string, int, error) + +// Decode calls the contained function on the inputs, passing through the +// returned values verbatim. +func (f StringDecoderFunc) Decode(b []byte, c int) (string, int, error) { + return f(b, c) +} + +// StringEncoding describes the most significant two bits of the SDR Type/Length +// Byte, specified in 37.15 and 43.15 of v1.5 and v2.0 respectively. +type StringEncoding uint8 + +const ( + // StringEncodingUnicode, contrary to the name, typically suggests an + // unspecified encoding. IPMItool displays a hex representation of the + // underlying bytes, while OpenIPMI interprets it identically to + // StringEncoding8BitAsciiLatin1. Given Unicode is only a character set and + // the spec does not suggest any encoding, there is no right answer. The + // resulting variety of implementations means use of this value by a BMC + // should be regarded as a bug. 
+ StringEncodingUnicode StringEncoding = iota + StringEncodingBCDPlus + StringEncodingPacked6BitAscii + StringEncoding8BitAsciiLatin1 +) + +var ( + stringEncodingDescriptions = map[StringEncoding]string{ + StringEncodingUnicode: "Unicode", + StringEncodingBCDPlus: "BCD plus", + StringEncodingPacked6BitAscii: "6-bit ASCII, packed", + StringEncoding8BitAsciiLatin1: "8-bit ASCII + Latin 1", + } + // to ease readability and testability. + stringEncodingDecoders = map[StringEncoding]StringDecoder{ + // despite the ambiguity of StringEncodingUnicode, we follow OpenIPMI + // and decode it as 8-bit ASCII + StringEncodingUnicode: StringDecoderFunc(decode8BitAsciiLatin1), + StringEncodingBCDPlus: StringDecoderFunc(decodeBCDPlus), + StringEncodingPacked6BitAscii: StringDecoderFunc(decodePacked6BitAscii), + StringEncoding8BitAsciiLatin1: StringDecoderFunc(decode8BitAsciiLatin1), + } +) + +func (e StringEncoding) Decoder() (StringDecoder, error) { + if decoder, ok := stringEncodingDecoders[e]; ok { + return decoder, nil + } + + return nil, fmt.Errorf("no decoder found for encoding %v", e) +} + +func (e StringEncoding) Description() string { + if desc, ok := stringEncodingDescriptions[e]; ok { + return desc + } + + return unknown +} + +func (e StringEncoding) String() string { + return fmt.Sprintf("%#v(%v)", uint8(e), e.Description()) +} + +func decodeBCDPlus(b []byte, c int) (string, int, error) { + // each byte contains 2 characters (1 per nibble), so the number of + // bytes we expect equals half the number of characters, rounded up + bytes := int(math.Ceil(float64(c) / 2)) + if len(b) < bytes { + return "", 0, fmt.Errorf("expected %v bytes, got %v", bytes, len(b)) + } + + runes := make([]rune, c) + + for i := range c { + shift := uint8(0) + if i%2 == 0 { + // character is in the most significant 4 bits; need to + // shift down + shift = 4 + } + + runes[i] = bcdPlusRunes[(b[i/2]>>shift)&0xf] + } + + return string(runes), bytes, nil +} + +func decodePacked6BitAscii(b []byte, c 
int) (string, int, error) { + // the minimum number of bytes required to represent c characters; c does + // not have to be a multiple of 4 + bytes := c - (c / 4) + if len(b) < bytes { + return "", 0, fmt.Errorf("expected %v bytes, got %v", bytes, len(b)) + } + + runes := make([]rune, c) + acc := uint8(0) + + for i := range c { + // offset is the start offset for the first ASCII bits of the char at i; + // this formula required a bit of experimentation in Excel. N.B. cannot + // remove math.Floor() as need to round towards 0, not just strip the + // fractional component. + offset := (i - 1) - int(math.Floor(float64(i-1)/4)) + + // the switch extracts the appropriate 6 bits into acc (most significant + // two bits will always be 0) + switch i % 4 { + case 0: + // least significant 6 bits at offset + acc = b[offset] & 0x3f + case 1: + // least sig 2 bits: most sig 2 bits at offset + // most sig 4 bits: least sig 4 bits at offset + 1 + acc = b[offset] >> 6 + acc |= (b[offset+1] & 0xf) << 2 + case 2: + // least sig 4 bits: most sig 4 bits at offset + // most sig 2 bits: least sig 2 bits at offset + 1 + acc = b[offset] >> 4 + acc |= (b[offset+1] & 0x3) << 4 + case 3: + // most sig 6 bits at offset + acc = b[offset] >> 2 + } + + // observe character corresponding to code + runes[i] = rune(acc + 0x20) + } + + return string(runes), bytes, nil +} + +func decode8BitAsciiLatin1(b []byte, c int) (string, int, error) { + if len(b) < 2 { + // it is unclear why this limitation exists, but it's plain to + // see in the specification + return "", 0, fmt.Errorf("at least 2 bytes of data must be present; got %v bytes", len(b)) + } + + // bounds check to ensure the slicing below does not panic + if len(b) < c { + return "", 0, fmt.Errorf("expected %v bytes, got %v", c, len(b)) + } + + // can convert straight into a string as the encoding's range is + // identical to UTF-8 + return string(b[:c]), c, nil +} + +const ( + LinearisationLinear Linearisation = iota + LinearisationLn + 
LinearisationLog10 + LinearisationLog2 + LinearisationE + LinearisationExp10 + LinearisationExp2 + LinearisationInverse + LinearisationSqr + LinearisationCube + LinearisationSqrt + LinearisationCubeRt + LinearisationNonLinear + + // 0x71 through 0x7f are reserved for non-linear, OEM defined + // linearisations. It is unclear why these cannot use + // LinearisationNonLinear, as being non-linear, they do not have a + // linearisation formula. Waiting for a use case to emerge rather than + // implementing a questionably useful RegisterLineariser() function. +) + +var ( + // ErrNotLinearised is returned if Lineariser() is called on a linear or + // non-linear linearisation. Linear sensors' values do not require any + // transformation by virtue of the sensor already being linear. If the sensor + // is non-linear, the conversion factors returned by Get Sensor Reading + // Factors are all that are needed to obtain a real value: by being unique + // to the raw sensor reading, there is no need for a separate linearisation + // formula. + // + // Linearise() could return a no-op lineariser, however the current + // implementation should never ask for one on a non-linearised sensor, so + // instead we return an error to flag up a possible bug. + ErrNotLinearised = errors.New( + "only linearised sensors have a linearisation formula") + + linearisationDescriptions = map[Linearisation]string{ + LinearisationLinear: "Linear", + LinearisationLn: "ln", + LinearisationLog10: "log10", + LinearisationLog2: "log2", + LinearisationE: "e", + LinearisationExp10: "exp10", + LinearisationExp2: "exp2", + LinearisationInverse: "1/x", + LinearisationSqr: "sqr(x)", + LinearisationCube: "cube(x)", + LinearisationSqrt: "sqrt(x)", + LinearisationCubeRt: "x^(1/3)", + LinearisationNonLinear: "Non-linear", + } + + // linearisationLinearisers allows us to find out what linearisation formula + // needs to be applied to the converted output of a linearised sensor, to + // produce a real value. 
Note that linear and non-linear linearisations do + // not appear here as they don't need a linearisation formula. + linearisationLinearisers = map[Linearisation]Lineariser{ + LinearisationLn: LineariserFunc(math.Log), + LinearisationLog10: LineariserFunc(math.Log10), + LinearisationLog2: LineariserFunc(math.Log2), + LinearisationE: LineariserFunc(math.Exp), + LinearisationExp10: LineariserFunc(func(f float64) float64 { + // cannot use math.Pow10 as that takes an int + return math.Pow(10, f) + }), + LinearisationExp2: LineariserFunc(math.Exp2), + LinearisationInverse: LineariserFunc(func(f float64) float64 { + return math.Pow(f, -1) + }), + LinearisationSqr: LineariserFunc(func(f float64) float64 { + return f * f + }), + LinearisationCube: LineariserFunc(func(f float64) float64 { + return f * f * f + }), + LinearisationSqrt: LineariserFunc(math.Sqrt), + LinearisationCubeRt: LineariserFunc(func(f float64) float64 { + return math.Pow(f, 1./3) + }), + } +) + +// Linearisation indicates whether a sensor is linear, linearised, or +// non-linear. Values are specified in the Full Sensor Record wire format table +// in 37-1 and 43-1 of v1.5 and v2.0 respectively. +// +// Linear sensors are the easiest to deal with. The sensor's raw readings are +// converted into real readings (e.g. Celsius) with a linear formula. Accuracy +// and resolution are constant in real terms across the entire range of values +// produced by the sensor. +// +// Linearised are slightly more challenging. The same linear formula is applied +// as for linear sensors, however a final "linearisation formula" is applied to +// obtain the real reading. This transformation is one of 11 defined in the +// spec, e.g. log or sqrt, and obviously does not have to be linear itself. The +// tolerance (the spec misuses accuracy as a synonym) of linearised sensors is +// also constant for all values. 
This is possible despite the existence of the +// linearisation formula turning raw values into disproportionate real values, +// as tolerance is expressed relative to 0. This assumes the sensor's tolerance +// does not diminish in real, absolute terms at extreme values (positive or +// negative), as there is no way of representing it (you'd have to resort to +// declaring it a non-linear sensor). Note that tolerance can only be expressed +// in half-raw value increments, which is in itself quite coarse. Regarding +// resolution, this will vary with reading due to the linearisation formula. The +// recommended way to calculate it is to retrieve and calculate the real values +// (with the help of Get Sensor Reading Factors as necessary) corresponding to +// the raw values below and above the actual raw value observed. Subtracting the +// real reading for the raw value below the observed raw value from the real +// reading for the observed value gives the negative resolution, and the process +// is equivalent for the positive resolution using the raw value one above. +// +// All consistency bets are off with non-linear sensors. Not only does +// resolution vary by reading (calculated in the same was as for linearised +// sensors), but so does tolerance. Get Sensor Reading Factors must be sent with +// each raw reading; applying the linear formula using the returned conversion +// factors yields the real reading, and can the same factors can be plugged into +// the tolerance and resolution formulae to calculate them. +type Linearisation uint8 + +// IsLinear returns whether the underlying sensor is linear. Calling +// Lineariser() will return an error, as there is no linearisation formula (it +// is effectively a no-op). Only the linear formula in the spec needs be applied +// to obtain a real reading. 
+func (l Linearisation) IsLinear() bool { + return l == LinearisationLinear +} + +// IsLinearised returns whether the underlying sensor is linearised, meaning the +// value after conversion needs to be fed through a linearisation formula as a +// final step before being used. A suitable implementation of this function is +// returned by the Lineariser() method. +func (l Linearisation) IsLinearised() bool { + return l > LinearisationLinear && l < LinearisationNonLinear +} + +// IsNonLinear returns whether the underlying sensor is not consistent enough +// for the constraints of linear and linearised. As for linear sensors, +// attempting to retrieve a Lineariser will return an error. Readings from these +// sensors require Get Sensor Reading Factors to convert them into usable +// values. +func (l Linearisation) IsNonLinear() bool { + return l >= LinearisationNonLinear +} + +// Lineariser returns a suitable Lineariser implementation that will turn the +// converted raw value produced by the underlying sensor into a usable value. If +// the sensor is already linear, or non-linear, this will return +// ErrNotLinearised. +func (l Linearisation) Lineariser() (Lineariser, error) { + if lineariser, ok := linearisationLinearisers[l]; ok { + return lineariser, nil + } + + return nil, ErrNotLinearised +} + +func (l Linearisation) Description() string { + if desc, ok := linearisationDescriptions[l]; ok { + return desc + } + + if l >= 0x71 && l <= 0x7f { + return "Non-linear OEM" + } + + return unknown +} + +func (l Linearisation) String() string { + return fmt.Sprintf("%#v(%v)", uint8(l), l.Description()) +} + +// Lineariser is implemented by formulae that can linearise a value returned by +// the Get Sensor Reading command that has gone through the linear formula +// containing M, B, K1 and K2, used for all sensors. +type Lineariser interface { + // Linearise applies a linearisation formula to a converted value, returning + // the final value in the correct unit. 
This is the last step in the "Sensor + // Reading Conversion Formula" described in section 30.3 of IPMI v1.5 and + // v2.0. + Linearise(v float64) float64 +} + +// LineariserFunc is the type of the function in the Lineariser interface. It +// allows us to create stateless Lineariser implementations from raw functions, +// including those in the math package. +type LineariserFunc func(float64) float64 + +// Linearise invokes the wrapped function, passing through the input and result. +func (l LineariserFunc) Linearise(f float64) float64 { + return l(f) +} + +// SensorUnit defines the unit of a sensor. It is specified in 37.17 and 43.17 +// of v1.5 and v2.0 respectively. It is an 8-bit uint on the wire. +type SensorUnit uint8 + +const ( + _ SensorUnit = iota + SensorUnitCelsius + SensorUnitFahrenheit + SensorUnitKelvin + SensorUnitVolts + SensorUnitAmps + SensorUnitWatts + SensorUnitJoules + SensorUnitCoulombs + SensorUnitVoltamperes + SensorUnitNits + SensorUnitLumen + SensorUnitLux + SensorUnitCandela + SensorUnitKilopascals + SensorUnitPoundsPerSquareInch + SensorUnitNewtons + SensorUnitCubicFeetPerMinute + SensorUnitRotationsPerMinute + SensorUnitHertz + SensorUnitMicroseconds + SensorUnitMilliseconds + SensorUnitSeconds + SensorUnitMinutes + SensorUnitHours + SensorUnitDays + SensorUnitWeeks + SensorUnitMils + SensorUnitInches + SensorUnitFeet + SensorUnitCubicInches + SensorUnitCubicFeet + SensorUnitMillimeters + SensorUnitCentimeters + SensorUnitMeters + SensorUnitCubicCentimeters + SensorUnitCubicMeters + SensorUnitLiters + SensorUnitFluidOunces + SensorUnitRadians + SensorUnitSteradians + SensorUnitRevolutions + SensorUnitCycles + SensorUnitGravities + SensorUnitOunces + SensorUnitPounds + SensorUnitFeetPounds + SensorUnitOunceInches + SensorUnitGauss + SensorUnitGilberts + SensorUnitHenry + SensorUnitMillihenry + SensorUnitFarad + SensorUnitMicrofarad + SensorUnitOhms + SensorUnitSiemens + SensorUnitMoles + SensorUnitBecquerel + SensorUnitPartsPerMillion + 
_ + SensorUnitDecibels + SensorUnitDecibelsAFilter + SensorUnitDecibelsCFilter + SensorUnitGray + SensorUnitSieverts + SensorUnitColorTempKelvin + SensorUnitBits + SensorUnitKilobits + SensorUnitMegabits + SensorUnitGigabits + SensorUnitBytes + SensorUnitKilobytes + SensorUnitMegabytes + SensorUnitGigabytes + SensorUnitWords + SensorUnitDwords + SensorUnitQwords + SensorUnitMemoryLines + SensorUnitHits + SensorUnitMisses + SensorUnitRetries + SensorUnitResets + SensorUnitOverflows + SensorUnitUnderruns + SensorUnitCollisions + SensorUnitPackets + SensorUnitMessages + SensorUnitCharacters + SensorUnitErrors + SensorUnitCorrectableErrors + SensorUnitUncorrectableErrors + SensorUnitFatal + SensorUnitGrams +) + +var sensorUnitSymbols = map[SensorUnit]string{ + SensorUnitCelsius: "C", + SensorUnitFahrenheit: "F", + SensorUnitKelvin: "K", + SensorUnitVolts: "V", + SensorUnitAmps: "A", + SensorUnitWatts: "W", + SensorUnitJoules: "J", + SensorUnitCoulombs: "C", + SensorUnitVoltamperes: "VA", + SensorUnitNits: "nt", + SensorUnitLumen: "lm", + SensorUnitLux: "lx", + SensorUnitCandela: "cd", + SensorUnitKilopascals: "kPa", + SensorUnitPoundsPerSquareInch: "psi", + SensorUnitNewtons: "nt", + SensorUnitCubicFeetPerMinute: "CFM", + SensorUnitRotationsPerMinute: "RPM", + SensorUnitHertz: "Hz", + SensorUnitMicroseconds: "μs", + SensorUnitMilliseconds: "ms", + SensorUnitSeconds: "s", + SensorUnitMinutes: "min", + SensorUnitHours: "hr", + SensorUnitDays: "d", + SensorUnitWeeks: "w", + SensorUnitMils: "mil", + SensorUnitInches: "in", + SensorUnitFeet: "ft", + SensorUnitCubicInches: "in³", + SensorUnitCubicFeet: "ft³", + SensorUnitMillimeters: "mm", + SensorUnitCentimeters: "cm", + SensorUnitMeters: "m", + SensorUnitCubicCentimeters: "cm³", + SensorUnitCubicMeters: "m³", + SensorUnitLiters: "l", + SensorUnitFluidOunces: "fl oz", + SensorUnitRadians: "rad", + SensorUnitSteradians: "sr", + SensorUnitRevolutions: "rev", + SensorUnitCycles: "Hz", + SensorUnitGravities: "g", + 
SensorUnitOunces: "oz", + SensorUnitPounds: "lb", + SensorUnitFeetPounds: "ft-lb", + SensorUnitOunceInches: "oz-in", + SensorUnitGauss: "G", + SensorUnitGilberts: "Gb", + SensorUnitHenry: "H", + SensorUnitMillihenry: "mH", + SensorUnitFarad: "F", + SensorUnitMicrofarad: "μF", + SensorUnitOhms: "Ω", + SensorUnitSiemens: "Ω⁻¹", + SensorUnitMoles: "mol", + SensorUnitBecquerel: "Bq", + SensorUnitPartsPerMillion: "ppm", + SensorUnitDecibels: "dB", + SensorUnitDecibelsAFilter: "dBA", + SensorUnitDecibelsCFilter: "dBC", + SensorUnitGray: "Gy", + SensorUnitSieverts: "Sv", + SensorUnitColorTempKelvin: "ColorK", + SensorUnitBits: "b", + SensorUnitKilobits: "Kb", + SensorUnitMegabits: "Mb", + SensorUnitGigabits: "Gb", + SensorUnitBytes: "B", + SensorUnitKilobytes: "KB", + SensorUnitMegabytes: "MB", + SensorUnitGigabytes: "GB", + SensorUnitWords: "word", + SensorUnitDwords: "dword", + SensorUnitQwords: "qword", + SensorUnitMemoryLines: "memory line", + SensorUnitHits: "hit", + SensorUnitMisses: "miss", + SensorUnitRetries: "retry", + SensorUnitResets: "reset", + SensorUnitOverflows: "overflow", + SensorUnitUnderruns: "underrun", + SensorUnitCollisions: "collision", + SensorUnitPackets: "pkt", + SensorUnitMessages: "msg", + SensorUnitCharacters: "char", + SensorUnitErrors: "err", + SensorUnitCorrectableErrors: "correctable err", + SensorUnitUncorrectableErrors: "uncorrectable err", + SensorUnitFatal: "fatal", + SensorUnitGrams: "g", +} + +func (s SensorUnit) Symbol() string { + if s == 0 { + return "Unspecified/Unused" + } + + if symbol, ok := sensorUnitSymbols[s]; ok { + return symbol + } + + return unknown +} + +func (s SensorUnit) String() string { + return s.Symbol() +} + +// twos parses two's complement numbers of up to 16 bits into a native integer. +// The input is two bytes in big-endian order, and the number of bits the binary +// representation is expected to be (0 through 16). More significant bits above +// this must be 0, e.g. 
twos([...]byte{0b000000xx, 0bxxxxxxxx}, 10). +func twos(bigEndian [2]byte, bits uint8) int16 { + // this abstracts away the endian-ness of the platform; big-endian only + // refers to the byte order of the input. It is identical to + // binary.BigEndian.Uint16(), but avoids creating a slice. + numerical := uint16(bigEndian[1]) | uint16(bigEndian[0])<<8 + + // sign extend to 16 bits + // (https://graphics.stanford.edu/~seander/bithacks.html, "Sign extending + // from a variable bit-width") + mask := uint16(1) << (uint16(bits) - 1) + numerical = (numerical ^ mask) - mask + + // make signed - same underlying bits, just different type + return int16(numerical) //nolint:gosec } diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 5b071f5a..51d5c563 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -521,9 +521,9 @@ then --collector.slurm.gres-config-file="pkg/collector/testdata/gres.conf" \ --collector.gpu.type="nvidia" \ --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ - --collector.ipmi_dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ + --collector.ipmi \ + --collector.ipmi.test-mode \ + --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ --collector.redfish \ --collector.redfish.web-config="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -545,9 +545,9 @@ then --collector.slurm.gres-config-file="pkg/collector/testdata/gres.conf" \ --collector.gpu.type="nvidia" \ --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi \ + --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ + --collector.ipmi.test-mode \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ 
--web.disable-exporter-metrics \ @@ -567,8 +567,8 @@ then --collector.rdma.stats \ --collector.rdma.cmd="pkg/collector/testdata/rdma" \ --collector.empty-hostname-label \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -592,8 +592,8 @@ then --collector.rdma.stats \ --collector.rdma.cmd="pkg/collector/testdata/rdma" \ --collector.empty-hostname-label \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi \ + --collector.ipmi.test-mode \ --collector.cray_pm_counters \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -611,8 +611,8 @@ then --collector.gpu.rocm-smi-path="pkg/collector/testdata/rocm-smi" \ --collector.empty-hostname-label \ --collector.hwmon \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -630,8 +630,8 @@ then --collector.slurm \ --collector.gpu.type="nogpu" \ --collector.empty-hostname-label \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi \ + --collector.ipmi.test-mode \ --collector.infiniband \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -647,9 +647,9 @@ then --collector.slurm \ --collector.gpu.type="nvidia" \ --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/ipmiutils/ipmiutil" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.cray_pm_counters \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ @@ -668,9 +668,9 @@ 
then --collector.gpu.rocm-smi-path="pkg/collector/testdata/rocm-smi" \ --collector.slurm.swap.memory.metrics \ --collector.slurm.psi.metrics \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -693,9 +693,9 @@ then --collector.libvirt.swap-memory-metrics \ --collector.libvirt.psi-metrics \ --collector.libvirt.blkio-metrics \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -714,9 +714,9 @@ then --collector.libvirt.swap-memory-metrics \ --collector.libvirt.psi-metrics \ --collector.libvirt.blkio-metrics \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -738,9 +738,9 @@ then --collector.libvirt.swap-memory-metrics \ --collector.libvirt.psi-metrics \ --collector.libvirt.blkio-metrics \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -760,9 +760,9 @@ then --collector.k8s.kubelet-socket-file="${CEEMS_KUBELET_SOCKET_DIR}/nvidia/kubelet.sock" \ --collector.gpu.type="nvidia" \ 
--collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -779,9 +779,9 @@ then --collector.k8s.kubelet-socket-file="${CEEMS_KUBELET_SOCKET_DIR}/amd/kubelet.sock" \ --collector.gpu.type="amd" \ --collector.gpu.amd-smi-path="pkg/collector/testdata/amd-smi" \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -799,9 +799,9 @@ then --collector.gpu.type="nogpu" \ --collector.k8s \ --collector.k8s.kube-config-file="pkg/collector/testdata/k8s/kubeconfig.yml" \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -816,9 +816,9 @@ then --collector.cgroups.force-version="v2" \ --collector.slurm \ --collector.gpu.type="nogpu" \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - --collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -833,9 +833,9 @@ then --collector.slurm \ --collector.gpu.type="nogpu" \ --collector.cgroups.force-version="v1" \ - --collector.ipmi_dcmi \ + --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ - 
--collector.ipmi_dcmi.test-mode \ + --collector.ipmi.test-mode \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ @@ -1521,9 +1521,9 @@ then --collector.cgroups.force-version="v1" \ --collector.slurm \ --collector.gpu.type="nogpu" \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ - --collector.ipmi_dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ + --collector.ipmi \ + --collector.ipmi.test-mode \ + --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9010" \ --web.disable-exporter-metrics \ @@ -1603,9 +1603,9 @@ then --collector.slurm \ --collector.gpu.type="nvidia" \ --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ - --collector.ipmi_dcmi \ - --collector.ipmi_dcmi.test-mode \ - --collector.ipmi_dcmi.cmd="pkg/collector/testdata/ipmi/openipmi/ipmitool" \ + --collector.ipmi \ + --collector.ipmi.test-mode \ + --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/openipmi/ipmitool" \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9015" \ --web.disable-exporter-metrics \ diff --git a/thirdparty/grafana/dashboards/admin/cluster-status.json b/thirdparty/grafana/dashboards/admin/cluster-status.json index 7e6df461..46402491 100644 --- a/thirdparty/grafana/dashboards/admin/cluster-status.json +++ b/thirdparty/grafana/dashboards/admin/cluster-status.json @@ -2058,6 +2058,10 @@ { "allValue": ".*", "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "definition": "label_values(job)", "description": "Prometheus Scrape Job", "label": "Scrape Job:", @@ -2132,7 +2136,7 @@ }, "timezone": "", "title": "Cluster Status", - "uid": "cldpmdfp36n2tcb", + "uid": "ceems-cluster-status", "version": 2, "weekStart": "" } \ No newline at end of file diff --git 
a/thirdparty/grafana/dashboards/k8s/k8s-pod-summary.json b/thirdparty/grafana/dashboards/k8s/k8s-pod-summary.json index 2485fcb1..5bf5488c 100644 --- a/thirdparty/grafana/dashboards/k8s/k8s-pod-summary.json +++ b/thirdparty/grafana/dashboards/k8s/k8s-pod-summary.json @@ -2272,6 +2272,7 @@ }, "timezone": "", "title": "User k8s Pod Summary", + "uid": "k8s-pod-summary", "version": 1, "weekStart": "" } \ No newline at end of file diff --git a/thirdparty/grafana/dashboards/openstack/os-vm-summary.json b/thirdparty/grafana/dashboards/openstack/os-vm-summary.json index 63c99bcb..1f41818f 100644 --- a/thirdparty/grafana/dashboards/openstack/os-vm-summary.json +++ b/thirdparty/grafana/dashboards/openstack/os-vm-summary.json @@ -2270,6 +2270,7 @@ }, "timezone": "", "title": "User Openstack VM Summary", + "uid": "os-vm-summary", "version": 1, "weekStart": "" } \ No newline at end of file diff --git a/thirdparty/grafana/dashboards/slurm/slurm-job-summary.json b/thirdparty/grafana/dashboards/slurm/slurm-job-summary.json index d69c4848..dd4ff42a 100644 --- a/thirdparty/grafana/dashboards/slurm/slurm-job-summary.json +++ b/thirdparty/grafana/dashboards/slurm/slurm-job-summary.json @@ -2290,6 +2290,7 @@ }, "timezone": "", "title": "User SLURM Job Summary", + "uid": "slurm-job-summary", "version": 1, "weekStart": "" } \ No newline at end of file diff --git a/website/.gitignore b/website/.gitignore index e0ede87a..f510ef8d 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -20,3 +20,8 @@ docs/api/*.ts npm-debug.log* yarn-debug.log* yarn-error.log* + +# Ignore copied files +docs/97-SECURITY.md +docs/98-CONTRIBUTING.md +docs/99-CHANGELOG.md diff --git a/website/cspell.json b/website/cspell.json index d78e82af..8d7d2b5b 100644 --- a/website/cspell.json +++ b/website/cspell.json @@ -79,7 +79,8 @@ "OCC", "kubeconfig", "kubeflow", - "myrules" + "myrules", + "elist" ], // flagWords - list of words to be always considered incorrect // This is useful for offensive words and common 
spelling errors. diff --git a/website/docs/configuration/ceems-exporter.md b/website/docs/configuration/ceems-exporter.md index 8acb7c50..97789f08 100644 --- a/website/docs/configuration/ceems-exporter.md +++ b/website/docs/configuration/ceems-exporter.md @@ -309,8 +309,10 @@ for getting deviceIDs attached to each pod. This is discussed more in detail in :::important[IMPORTANT] +In version `0.11.0`, this collector has been renamed to `--collector.ipmi`. + From version `0.5.0`, this collector is disabled by default and it -must be explicitly enabled using the CLI flag `--collector.ipmi_dcmi` +must be explicitly enabled using the CLI flag `--collector.ipmi` ::: @@ -327,7 +329,7 @@ implementation of the IPMI protocol using the Currently, this mode is only used as a fallback when no third-party libraries are found on the host. However, the pure Go implementation is more performant than calling third-party libraries in a sub-process and it should be preferred over other methods. -Users can force this mode by passing the CLI flag `--collector.ipmi_dcmi.force-native-mode` +Users can force this mode by passing the CLI flag `--collector.ipmi.force-native-mode` ::: @@ -335,7 +337,7 @@ Thus, in order to enable and force the IPMI collector in native mode, the follow must be passed to the exporter: ```bash -ceems_exporter --collector.ipmi_dcmi --collector.ipmi_dcmi.force-native-mode +ceems_exporter --collector.ipmi --collector.ipmi.force-native-mode ``` Generally, `ipmi` related commands are available only for `root`. More on the privileges @@ -349,6 +351,16 @@ might not include the power consumption of GPUs. ::: +When native mode is enabled, IPMI collector is capable of exporting sensor reading +metrics as well. When flag `--collector.ipmi.power-energy-sensor-readings` is enabled, +the collector will automatically detect the sensors that have units of Watts or Joules +and exports the readings of those sensors to Prometheus. 
In case the user wants to +export metrics of sensors other than power and energy, it can be configured using the +CLI flag `--collector.ipmi.sensor-id` (it can be repeated to configure multiple sensors). +The sensor IDs can be fetched using commands like `ipmitool sdr elist` or `ipmi-sensors -v`. +Note that sensor IDs are presented in hex in the output of these commands and they must be +converted to decimal in the CLI flag `--collector.ipmi.sensor-id`. + ### Redfish collector Redfish exposes the BMC related telemetry data using a REST API server. Thus, this diff --git a/website/package.json b/website/package.json index 82a6f98d..4aa88acc 100644 --- a/website/package.json +++ b/website/package.json @@ -4,8 +4,8 @@ "private": true, "scripts": { "docusaurus": "docusaurus", - "start": "yarn re-gen && docusaurus start", - "build": "yarn re-gen && docusaurus build", + "start": "yarn copy && yarn re-gen && docusaurus start", + "build": "yarn copy && yarn re-gen && docusaurus build", "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", "clear": "docusaurus clear", @@ -15,6 +15,7 @@ "gen-all": "docusaurus gen-api-docs all --all-versions", "clean-all": "docusaurus clean-api-docs all --all-versions", "re-gen": "yarn clean-all && yarn gen-all", + "copy": "cp ../SECURITY.md docs/97-SECURITY.md && cp ../CONTRIBUTING.md docs/98-CONTRIBUTING.md && cp ../CHANGELOG.md docs/99-CHANGELOG.md", "typecheck": "tsc", "spellcheck": "cspell docs/**/*.md docs/*.md ../etc/**/*.md ../README.md", "linkcheck": "markdown-link-check -c md-link-check.json docs/**/*.md docs/*.md ../etc/**/*.md ../README.md", diff --git a/website/yarn.lock b/website/yarn.lock index aeeaff72..d1d9a578 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -3998,15 +3998,10 @@ caniuse-api@^3.0.0: lodash.memoize "^4.1.2" lodash.uniq "^4.5.0" -caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001646, caniuse-lite@^1.0.30001669: - version "1.0.30001684" - resolved 
"https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz#0eca437bab7d5f03452ff0ef9de8299be6b08e16" - integrity sha512-G1LRwLIQjBQoyq0ZJGqGIJUXzJ8irpbjHLpVRXDvBEScFJ9b17sgK6vlx0GAJFE21okD7zXl08rRRUfq6HdoEQ== - -caniuse-lite@^1.0.30001702, caniuse-lite@^1.0.30001718: - version "1.0.30001721" - resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001721.tgz#36b90cd96901f8c98dd6698bf5c8af7d4c6872d7" - integrity sha512-cOuvmUVtKrtEaoKiO0rSc29jcjwMwX5tOHDy4MgVFEWiUXj4uBMJkwI8MDySkgXidpMiHUcviogAvFi4pA2hDQ== +caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001646, caniuse-lite@^1.0.30001669, caniuse-lite@^1.0.30001702, caniuse-lite@^1.0.30001718: + version "1.0.30001734" + resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001734.tgz" + integrity sha512-uhE1Ye5vgqju6OI71HTQqcBCZrvHugk0MjLak7Q+HfoBgoq5Bi+5YnwjP4fjDgrtYr/l8MVRBvzz9dPD4KyK0A== ccount@^2.0.0: version "2.0.1" @@ -10790,16 +10785,7 @@ std-env@^3.7.0: resolved "https://registry.yarnpkg.com/std-env/-/std-env-3.8.0.tgz#b56ffc1baf1a29dcc80a3bdf11d7fca7c315e7d5" integrity sha512-Bc3YwwCB+OzldMxOXJIIvC6cPRWr/LxOp48CdQTOkPyk/t4JWWJbrilwBd7RJzKV8QW7tJkcgAmeuLLJugl5/w== -"string-width-cjs@npm:string-width@^4.2.0": - version "4.2.3" - resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" - integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== - dependencies: - emoji-regex "^8.0.0" - is-fullwidth-code-point "^3.0.0" - strip-ansi "^6.0.1" - -string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3: +"string-width-cjs@npm:string-width@^4.2.0", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3: version "4.2.3" resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== @@ -10848,14 
+10834,7 @@ stringify-object@^3.3.0: is-obj "^1.0.1" is-regexp "^1.0.0" -"strip-ansi-cjs@npm:strip-ansi@^6.0.1": - version "6.0.1" - resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" - integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== - dependencies: - ansi-regex "^5.0.1" - -strip-ansi@^6.0.0, strip-ansi@^6.0.1: +"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.0, strip-ansi@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== @@ -11734,16 +11713,7 @@ wildcard@^2.0.0, wildcard@^2.0.1: resolved "https://registry.yarnpkg.com/wildcard/-/wildcard-2.0.1.tgz#5ab10d02487198954836b6349f74fff961e10f67" integrity sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ== -"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0": - version "7.0.0" - resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" - integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== - dependencies: - ansi-styles "^4.0.0" - string-width "^4.1.0" - strip-ansi "^6.0.0" - -wrap-ansi@^7.0.0: +"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==