From 6bed47c0633e44426e59c106ee315c9a053ad0be Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Thu, 28 Aug 2025 12:26:55 +0200 Subject: [PATCH] feat: Support runtime XML directory for libvirt collector * Seems like certain Openstack deployments might not create XML files for instances in persistent directory `/etc/libvirt/qemu`. So, we need to support looking into the runtime directory `/run/libvirt/qemu` in the collector. By default collector will look at both directories and file found in the first directory will be used to fetch instance properties. We always give priority to persistent folder over runtime folder while looking for files. * Unit tests have been modified to test the behaviour of XML files in runtime and persistent directories which have different XML schemas. Signed-off-by: Mahendra Paipuri --- pkg/collector/libvirt.go | 89 +++- pkg/collector/libvirt_test.go | 57 ++- .../testdata/qemu/instance-00000004.xml | 455 +++++++++++++----- website/docs/configuration/ceems-exporter.md | 9 + 4 files changed, 478 insertions(+), 132 deletions(-) diff --git a/pkg/collector/libvirt.go b/pkg/collector/libvirt.go index f7dc618d..4e627f5a 100644 --- a/pkg/collector/libvirt.go +++ b/pkg/collector/libvirt.go @@ -47,7 +47,7 @@ var ( libvirtXMLDir = CEEMSExporterApp.Flag( "collector.libvirt.xml-dir", "Directory containing XML files of instances", - ).Default("/etc/libvirt/qemu").Hidden().String() + ).Default("").Hidden().String() ) // Security context names. @@ -55,7 +55,28 @@ const ( libvirtReadXMLCtx = "libvirt_read_xml" ) -// Domain is the top level XML field for libvirt XML schema. +// XML folder locations +// There are two different locations: +// - /etc/libvirt/qemu - Persistent files +// - /run/libvirt/qemu - Runtime files +// Runtime files and persistent files have slightly different schemas +// so we need to handle them differently. Moreover runtime schema +// files will not be present when the VM is shut down. +// On some Openstack instances, nova might not create persistent files at all +// so we need to look at both runtime and persistent directories to find +// instance properties. +// +// More info: https://github.com/ceems-dev/ceems/discussions/402. +var ( + defaultLibvirtXMLDirs = []string{"/etc/libvirt/qemu", "/run/libvirt/qemu"} +) + +// Domstatus is the top level XML field for runtime XML files. +type DomStatus struct { + Domain Domain `xml:"domain"` +} + +// Domain is the top level XML field for persistent XML files. type Domain struct { Devices Devices `xml:"devices"` Name string `xml:"name"` @@ -100,7 +121,7 @@ type instanceProperties struct { // libvirtReadXMLSecurityCtxData contains the input/output data for // reading XML files inside a security context. type libvirtReadXMLSecurityCtxData struct { - xmlPath string + xmlDirs []string instanceID string devices []Device properties *instanceProperties @@ -114,6 +135,7 @@ type libvirtCollector struct { ebpfCollector *ebpfCollector rdmaCollector *rdmaCollector hostname string + libvirtXMLDirs []string gpuSMI *GPUSMI vGPUActivated bool instanceGpuFlag *prometheus.Desc @@ -230,6 +252,13 @@ func NewLibvirtCollector(logger *slog.Logger) (Collector, error) { } } + // In case XML files are stored in non-standard location, add them to + // paths + libvirtXMLDirs := defaultLibvirtXMLDirs + if *libvirtXMLDir != "" && !slices.Contains(libvirtXMLDirs, *libvirtXMLDir) { + libvirtXMLDirs = append(libvirtXMLDirs, *libvirtXMLDir) + } + // Setup necessary capabilities. These are the caps we need to read // XML files in /etc/libvirt/qemu folder that contains GPU devs used by guests. caps, err := setupAppCaps([]string{"cap_dac_read_search"}) @@ -259,6 +288,7 @@ func NewLibvirtCollector(logger *slog.Logger) (Collector, error) { perfCollector: perfCollector, ebpfCollector: ebpfCollector, rdmaCollector: rdmaCollector, + libvirtXMLDirs: libvirtXMLDirs, hostname: hostname, gpuSMI: gpuSMI, vGPUActivated: vGPUActivated, @@ -497,7 +527,7 @@ func (c *libvirtCollector) updateDeviceMappers(ch chan<- prometheus.Metric) { func (c *libvirtCollector) instanceProperties(instanceID string) *instanceProperties { // Read XML file in a security context that raises necessary capabilities dataPtr := &libvirtReadXMLSecurityCtxData{ - xmlPath: *libvirtXMLDir, + xmlDirs: c.libvirtXMLDirs, devices: c.gpuSMI.Devices, instanceID: instanceID, } @@ -578,6 +608,10 @@ func (c *libvirtCollector) updateDeviceInstances(cgroups []cgroup) { // We keep a map of instance ID to instance UUID to setup // UUIDs when this part of code is skipped + // NOTE: Never invalidate this cache as shut down instance's XML + // files might not be present in some cases but their cgroups will be + // always present. In that case we might not be able to get UUID of those + // shutdown instances if we invalidate the cache. c.instanceIDUUIDMap[cgrp.id] = properties.uuid for _, id := range properties.deviceIDs { @@ -637,17 +671,32 @@ func readLibvirtXMLFile(data any) error { return security.ErrSecurityCtxDataAssertion } - // Get full file path - xmlFilePath := filepath.Join(d.xmlPath, d.instanceID+".xml") + // Try xml dirs one by one + var xmlFilePath string - // If file does not exist return error - _, err := os.Stat(xmlFilePath) - if err != nil { - return err + var errs error + + for _, xmlDir := range d.xmlDirs { + xmlFilePath = filepath.Join(xmlDir, d.instanceID+".xml") + + // If file exists, break + _, err := os.Stat(xmlFilePath) + if err == nil { + goto read_file + } else { + errs = errors.Join(errs, err) + } } + // If we end up here, it means we did not find xml file + if errs != nil { + return fmt.Errorf("xml file for instance %s not found. It means instance might be in shutdown state: %w", d.instanceID, errs) + } + +read_file: // Read XML file contents xmlByteArray, err := os.ReadFile(xmlFilePath) + if err != nil { return err } @@ -655,9 +704,23 @@ func readLibvirtXMLFile(data any) error { // Read XML byte array into domain var domain Domain - err = xml.Unmarshal(xmlByteArray, &domain) - if err != nil { - return err + // Based on presence of domstatus tag in xml file, + // read contents into either domain or domstatus + switch { + case strings.Contains(string(xmlByteArray), "domstatus"): + var domStatus DomStatus + + err = xml.Unmarshal(xmlByteArray, &domStatus) + if err != nil { + return err + } + + domain = domStatus.Domain + default: + err = xml.Unmarshal(xmlByteArray, &domain) + if err != nil { + return err + } } // Initialise resources pointer diff --git a/pkg/collector/libvirt_test.go b/pkg/collector/libvirt_test.go index 91134d67..4c056049 100644 --- a/pkg/collector/libvirt_test.go +++ b/pkg/collector/libvirt_test.go @@ -29,6 +29,7 @@ func TestNewLibvirtCollector(t *testing.T) { "--collector.libvirt.xml-dir", "testdata/qemu", "--collector.perf.hardware-events", "--collector.rdma.stats", + "--collector.gpu.type", "nvidia", "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", "--collector.cgroups.force-version", "v2", }, @@ -60,7 +61,6 @@ func TestLibvirtInstanceProps(t *testing.T) { _, err := CEEMSExporterApp.Parse( []string{ "--path.cgroupfs", "testdata/sys/fs/cgroup", - "--collector.libvirt.xml-dir", "testdata/qemu", "--collector.cgroups.force-version", "v2", "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", "--collector.gpu.type", "nvidia", @@ -89,12 +89,17 @@ func TestLibvirtInstanceProps(t *testing.T) { err = gpu.Discover() require.NoError(t, err) + // XML dirs + libvirtXMLDirs := defaultLibvirtXMLDirs + libvirtXMLDirs = append(libvirtXMLDirs, "testdata/qemu") + c := libvirtCollector{ gpuSMI: gpu, logger: noOpLogger, cgroupManager: cgManager, vGPUActivated: true, instanceDevicesCacheTTL: time.Second, + libvirtXMLDirs: libvirtXMLDirs, instanceIDUUIDMap: make(map[string]string), instanceDeviceslastUpdateTime: time.Now(), securityContexts: make(map[string]*security.SecurityContext), @@ -125,12 +130,24 @@ func TestLibvirtInstanceProps(t *testing.T) { "11": {{UUID: "2896bdd5-dbc2-4339-9d8e-ddd838bf35d3", NumShares: 1}}, } - expectedUUIDs := []string{"57f2d45e-8ddf-4338-91df-62d0044ff1b5", "b674a0a2-c300-4dc6-8c9c-65df16da6d69", "2896bdd5-dbc2-4339-9d8e-ddd838bf35d3", "4de89c5b-50d7-4d30-a630-14e135380fe8"} + expectedUUIDs := []string{ + "57f2d45e-8ddf-4338-91df-62d0044ff1b5", + "b674a0a2-c300-4dc6-8c9c-65df16da6d69", + "2896bdd5-dbc2-4339-9d8e-ddd838bf35d3", + "4de89c5b-50d7-4d30-a630-14e135380fe8", + } + + expectedInstanceIDs := []string{ + "instance-00000001", + "instance-00000002", + "instance-00000003", + "instance-00000004", + } cgroups, err := c.instanceCgroups() require.NoError(t, err) - assert.ElementsMatch(t, []string{"instance-00000001", "instance-00000002", "instance-00000003", "instance-00000004"}, c.previousInstanceIDs) + assert.ElementsMatch(t, expectedInstanceIDs, c.previousInstanceIDs) assert.Len(t, cgroups, 4) // Check cgroup UUIDs are properly populated @@ -180,14 +197,13 @@ func TestInstancePropsCaching(t *testing.T) { err := os.Mkdir(cgroupsPath, 0o750) require.NoError(t, err) - xmlFilePath := path + "/qemu" - err = os.Mkdir(xmlFilePath, 0o750) + xmlDir := path + "/qemu" + err = os.Mkdir(xmlDir, 0o750) require.NoError(t, err) _, err = CEEMSExporterApp.Parse( []string{ "--path.cgroupfs", cgroupsPath, - "--collector.libvirt.xml-dir", xmlFilePath, "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", "--collector.gpu.type", "nvidia", }, @@ -214,11 +230,16 @@ func TestInstancePropsCaching(t *testing.T) { err = gpu.Discover() require.NoError(t, err) + // XML dirs + libvirtXMLDirs := defaultLibvirtXMLDirs + libvirtXMLDirs = append(libvirtXMLDirs, xmlDir) + c := libvirtCollector{ cgroupManager: cgManager, logger: noOpLogger, gpuSMI: gpu, vGPUActivated: true, + libvirtXMLDirs: libvirtXMLDirs, instanceDevicesCacheTTL: 500 * time.Millisecond, instanceDeviceslastUpdateTime: time.Now(), instanceIDUUIDMap: make(map[string]string), @@ -250,7 +271,7 @@ func TestInstancePropsCaching(t *testing.T) { for idev, dev := range c.gpuSMI.Devices { xmlContentPH := ` -instance-%[1]d +instance-0000000%[1]d %[1]d @@ -260,17 +281,18 @@ func TestInstancePropsCaching(t *testing.T) { ` + if !dev.VGPUEnabled && !dev.InstancesEnabled { + iInstance++ xmlContent := fmt.Sprintf(xmlContentPH, idev, strconv.FormatUint(dev.BusID.bus, 16)) err = os.WriteFile( - fmt.Sprintf("%s/instance-0000000%d.xml", xmlFilePath, iInstance), + fmt.Sprintf("%s/instance-0000000%d.xml", xmlDir, iInstance), []byte(xmlContent), 0o600, ) require.NoError(t, err) fullGPUInstances = append(fullGPUInstances, idev) - iInstance++ } } @@ -285,7 +307,22 @@ func TestInstancePropsCaching(t *testing.T) { assert.Equal(t, []ComputeUnit{{UUID: strconv.FormatInt(int64(gpuID), 10), NumShares: 1}}, c.gpuSMI.Devices[gpuID].ComputeUnits) } - // Remove first 10 instances and add new 10 more instances + // Get the instance ID UUID map to compare it later + expectedInstanceIDUUIDMap := c.instanceIDUUIDMap + + // Remove all XML files + for i := range iInstance { + err = os.Remove(fmt.Sprintf("%s/instance-0000000%d.xml", xmlDir, i+1)) + require.NoError(t, err) + } + + // Now populate instancePropsCache again and we should have instanceIDUUIDMap intact + _, err = c.instanceCgroups() + require.NoError(t, err) + + assert.Equal(t, expectedInstanceIDUUIDMap, c.instanceIDUUIDMap) + + // Remove first 10 instances and add new 5 more instances for i := range 10 { dir := fmt.Sprintf("%s/cpuacct/machine.slice/machine-qemu\x2d1\x2dinstance\x2d0000000%d.scope", cgroupsPath, i) diff --git a/pkg/collector/testdata/qemu/instance-00000004.xml b/pkg/collector/testdata/qemu/instance-00000004.xml index 66434bb8..2b4faa92 100644 --- a/pkg/collector/testdata/qemu/instance-00000004.xml +++ b/pkg/collector/testdata/qemu/instance-00000004.xml @@ -5,114 +5,351 @@ OVERWRITTEN AND LOST. Changes to this xml configuration should be made using: or other application using the libvirt API. --> - - instance-00000004 - 4de89c5b-50d7-4d30-a630-14e135380fe8 - - - - dsdfs - 2024-10-04 18:10:20 - - 4096 - 20 - 0 - 0 - 4 - - - admin - admin - - - - - - - - - - 4194304 - 4194304 - 4 - - - OpenStack Foundation - OpenStack Nova - 30.1.0 - 5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e - 5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e - Virtual Machine - - - - hvm - - - - - - - - - - Nehalem - - - - - - - - destroy - restart - destroy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - /usr/bin/qemu-system-x86_64 - - - - - 3c97a143-f7ac-4bda-b546-c1e609270f08 - -
- - - - - - - - -
- - - - - - - - - - - - - - - - -