Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 76 additions & 13 deletions pkg/collector/libvirt.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,36 @@ var (
libvirtXMLDir = CEEMSExporterApp.Flag(
"collector.libvirt.xml-dir",
"Directory containing XML files of instances",
).Default("/etc/libvirt/qemu").Hidden().String()
).Default("").Hidden().String()
)

// Security context names.
//
// Each name identifies a security context used by the libvirt collector
// (presumably the key under which the context is registered; confirm
// against the collector setup code).
const (
// libvirtReadXMLCtx names the security context in which libvirt XML
// files are read.
libvirtReadXMLCtx = "libvirt_read_xml"
)

// Domain is the top level XML field for libvirt XML schema.
// XML folder locations
//
// Libvirt XML files can be found in two different locations:
//   - /etc/libvirt/qemu - Persistent files
//   - /run/libvirt/qemu - Runtime files
//
// Runtime files and persistent files have slightly different schemas,
// so they need to be handled differently. Moreover, runtime files
// will not be present when the VM is shut down.
// On some OpenStack deployments, nova might not create persistent files at
// all, so both the runtime and persistent directories must be searched to
// find instance properties.
//
// More info: https://github.com/ceems-dev/ceems/discussions/402.
var (
// defaultLibvirtXMLDirs lists the directories that are searched, in
// order, for an instance's XML file.
defaultLibvirtXMLDirs = []string{"/etc/libvirt/qemu", "/run/libvirt/qemu"}
)

// DomStatus is the top level XML field for runtime XML files.
// It wraps the domain element, which is decoded into the same Domain type
// that is used for persistent XML files.
type DomStatus struct {
Domain Domain `xml:"domain"`
}

// Domain is the top level XML field for persistent XML files.
type Domain struct {
Devices Devices `xml:"devices"`
Name string `xml:"name"`
Expand Down Expand Up @@ -100,7 +121,7 @@ type instanceProperties struct {
// libvirtReadXMLSecurityCtxData contains the input/output data for
// reading XML files inside a security context.
type libvirtReadXMLSecurityCtxData struct {
xmlPath string
xmlDirs []string
instanceID string
devices []Device
properties *instanceProperties
Expand All @@ -114,6 +135,7 @@ type libvirtCollector struct {
ebpfCollector *ebpfCollector
rdmaCollector *rdmaCollector
hostname string
libvirtXMLDirs []string
gpuSMI *GPUSMI
vGPUActivated bool
instanceGpuFlag *prometheus.Desc
Expand Down Expand Up @@ -230,6 +252,13 @@ func NewLibvirtCollector(logger *slog.Logger) (Collector, error) {
}
}

// In case XML files are stored in a non-standard location, add that
// directory to the search paths
libvirtXMLDirs := defaultLibvirtXMLDirs
if *libvirtXMLDir != "" && !slices.Contains(libvirtXMLDirs, *libvirtXMLDir) {
libvirtXMLDirs = append(libvirtXMLDirs, *libvirtXMLDir)
}

// Set up necessary capabilities. These are the caps we need to read
// XML files in the libvirt XML folders that contain GPU devs used by guests.
caps, err := setupAppCaps([]string{"cap_dac_read_search"})
Expand Down Expand Up @@ -259,6 +288,7 @@ func NewLibvirtCollector(logger *slog.Logger) (Collector, error) {
perfCollector: perfCollector,
ebpfCollector: ebpfCollector,
rdmaCollector: rdmaCollector,
libvirtXMLDirs: libvirtXMLDirs,
hostname: hostname,
gpuSMI: gpuSMI,
vGPUActivated: vGPUActivated,
Expand Down Expand Up @@ -497,7 +527,7 @@ func (c *libvirtCollector) updateDeviceMappers(ch chan<- prometheus.Metric) {
func (c *libvirtCollector) instanceProperties(instanceID string) *instanceProperties {
// Read XML file in a security context that raises necessary capabilities
dataPtr := &libvirtReadXMLSecurityCtxData{
xmlPath: *libvirtXMLDir,
xmlDirs: c.libvirtXMLDirs,
devices: c.gpuSMI.Devices,
instanceID: instanceID,
}
Expand Down Expand Up @@ -578,6 +608,10 @@ func (c *libvirtCollector) updateDeviceInstances(cgroups []cgroup) {

// We keep a map of instance ID to instance UUID to setup
// UUIDs when this part of code is skipped
// NOTE: Never invalidate this cache. XML files of shut down instances
// might not be present in some cases, but their cgroups will always be
// present. If the cache were invalidated, we might not be able to get
// the UUIDs of those shut down instances.
c.instanceIDUUIDMap[cgrp.id] = properties.uuid

for _, id := range properties.deviceIDs {
Expand Down Expand Up @@ -637,27 +671,56 @@ func readLibvirtXMLFile(data any) error {
return security.ErrSecurityCtxDataAssertion
}

// Get full file path
xmlFilePath := filepath.Join(d.xmlPath, d.instanceID+".xml")
// Try xml dirs one by one
var xmlFilePath string

// If file does not exist return error
_, err := os.Stat(xmlFilePath)
if err != nil {
return err
var errs error

for _, xmlDir := range d.xmlDirs {
xmlFilePath = filepath.Join(xmlDir, d.instanceID+".xml")

// If file exists, stop searching and go read it
_, err := os.Stat(xmlFilePath)
if err == nil {
goto read_file
} else {
errs = errors.Join(errs, err)
}
}

// If we end up here, it means we did not find xml file
if errs != nil {
return fmt.Errorf("xml file for instance %s not found. It means instance might be in shutdown state: %w", d.instanceID, errs)
}

read_file:
// Read XML file contents
xmlByteArray, err := os.ReadFile(xmlFilePath)

if err != nil {
return err
}

// Read XML byte array into domain
var domain Domain

err = xml.Unmarshal(xmlByteArray, &domain)
if err != nil {
return err
// Based on presence of domstatus tag in xml file,
// read contents into either domain or domstatus
switch {
case strings.Contains(string(xmlByteArray), "domstatus"):
var domStatus DomStatus

err = xml.Unmarshal(xmlByteArray, &domStatus)
if err != nil {
return err
}

domain = domStatus.Domain
default:
err = xml.Unmarshal(xmlByteArray, &domain)
if err != nil {
return err
}
}

// Initialise resources pointer
Expand Down
57 changes: 47 additions & 10 deletions pkg/collector/libvirt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func TestNewLibvirtCollector(t *testing.T) {
"--collector.libvirt.xml-dir", "testdata/qemu",
"--collector.perf.hardware-events",
"--collector.rdma.stats",
"--collector.gpu.type", "nvidia",
"--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi",
"--collector.cgroups.force-version", "v2",
},
Expand Down Expand Up @@ -60,7 +61,6 @@ func TestLibvirtInstanceProps(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.libvirt.xml-dir", "testdata/qemu",
"--collector.cgroups.force-version", "v2",
"--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi",
"--collector.gpu.type", "nvidia",
Expand Down Expand Up @@ -89,12 +89,17 @@ func TestLibvirtInstanceProps(t *testing.T) {
err = gpu.Discover()
require.NoError(t, err)

// XML dirs
libvirtXMLDirs := defaultLibvirtXMLDirs
libvirtXMLDirs = append(libvirtXMLDirs, "testdata/qemu")

c := libvirtCollector{
gpuSMI: gpu,
logger: noOpLogger,
cgroupManager: cgManager,
vGPUActivated: true,
instanceDevicesCacheTTL: time.Second,
libvirtXMLDirs: libvirtXMLDirs,
instanceIDUUIDMap: make(map[string]string),
instanceDeviceslastUpdateTime: time.Now(),
securityContexts: make(map[string]*security.SecurityContext),
Expand Down Expand Up @@ -125,12 +130,24 @@ func TestLibvirtInstanceProps(t *testing.T) {
"11": {{UUID: "2896bdd5-dbc2-4339-9d8e-ddd838bf35d3", NumShares: 1}},
}

expectedUUIDs := []string{"57f2d45e-8ddf-4338-91df-62d0044ff1b5", "b674a0a2-c300-4dc6-8c9c-65df16da6d69", "2896bdd5-dbc2-4339-9d8e-ddd838bf35d3", "4de89c5b-50d7-4d30-a630-14e135380fe8"}
expectedUUIDs := []string{
"57f2d45e-8ddf-4338-91df-62d0044ff1b5",
"b674a0a2-c300-4dc6-8c9c-65df16da6d69",
"2896bdd5-dbc2-4339-9d8e-ddd838bf35d3",
"4de89c5b-50d7-4d30-a630-14e135380fe8",
}

expectedInstanceIDs := []string{
"instance-00000001",
"instance-00000002",
"instance-00000003",
"instance-00000004",
}

cgroups, err := c.instanceCgroups()
require.NoError(t, err)

assert.ElementsMatch(t, []string{"instance-00000001", "instance-00000002", "instance-00000003", "instance-00000004"}, c.previousInstanceIDs)
assert.ElementsMatch(t, expectedInstanceIDs, c.previousInstanceIDs)
assert.Len(t, cgroups, 4)

// Check cgroup UUIDs are properly populated
Expand Down Expand Up @@ -180,14 +197,13 @@ func TestInstancePropsCaching(t *testing.T) {
err := os.Mkdir(cgroupsPath, 0o750)
require.NoError(t, err)

xmlFilePath := path + "/qemu"
err = os.Mkdir(xmlFilePath, 0o750)
xmlDir := path + "/qemu"
err = os.Mkdir(xmlDir, 0o750)
require.NoError(t, err)

_, err = CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", cgroupsPath,
"--collector.libvirt.xml-dir", xmlFilePath,
"--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi",
"--collector.gpu.type", "nvidia",
},
Expand All @@ -214,11 +230,16 @@ func TestInstancePropsCaching(t *testing.T) {
err = gpu.Discover()
require.NoError(t, err)

// XML dirs
libvirtXMLDirs := defaultLibvirtXMLDirs
libvirtXMLDirs = append(libvirtXMLDirs, xmlDir)

c := libvirtCollector{
cgroupManager: cgManager,
logger: noOpLogger,
gpuSMI: gpu,
vGPUActivated: true,
libvirtXMLDirs: libvirtXMLDirs,
instanceDevicesCacheTTL: 500 * time.Millisecond,
instanceDeviceslastUpdateTime: time.Now(),
instanceIDUUIDMap: make(map[string]string),
Expand Down Expand Up @@ -250,7 +271,7 @@ func TestInstancePropsCaching(t *testing.T) {

for idev, dev := range c.gpuSMI.Devices {
xmlContentPH := `<domain type='kvm'>
<name>instance-%[1]d</name>
<name>instance-0000000%[1]d</name>
<uuid>%[1]d</uuid>
<devices>
<hostdev mode='subsystem' type='pci' managed='yes'>
Expand All @@ -260,17 +281,18 @@ func TestInstancePropsCaching(t *testing.T) {
</hostdev>
</devices>
</domain>`

if !dev.VGPUEnabled && !dev.InstancesEnabled {
iInstance++
xmlContent := fmt.Sprintf(xmlContentPH, idev, strconv.FormatUint(dev.BusID.bus, 16))
err = os.WriteFile(
fmt.Sprintf("%s/instance-0000000%d.xml", xmlFilePath, iInstance),
fmt.Sprintf("%s/instance-0000000%d.xml", xmlDir, iInstance),
[]byte(xmlContent),
0o600,
)
require.NoError(t, err)

fullGPUInstances = append(fullGPUInstances, idev)
iInstance++
}
}

Expand All @@ -285,7 +307,22 @@ func TestInstancePropsCaching(t *testing.T) {
assert.Equal(t, []ComputeUnit{{UUID: strconv.FormatInt(int64(gpuID), 10), NumShares: 1}}, c.gpuSMI.Devices[gpuID].ComputeUnits)
}

// Remove first 10 instances and add new 10 more instances
// Get the instance ID UUID map to compare it later
expectedInstanceIDUUIDMap := c.instanceIDUUIDMap

// Remove all XML files
for i := range iInstance {
err = os.Remove(fmt.Sprintf("%s/instance-0000000%d.xml", xmlDir, i+1))
require.NoError(t, err)
}

// Now populate instancePropsCache again and we should have instanceIDUUIDMap intact
_, err = c.instanceCgroups()
require.NoError(t, err)

assert.Equal(t, expectedInstanceIDUUIDMap, c.instanceIDUUIDMap)

// Remove first 10 instances and add new 5 more instances
for i := range 10 {
dir := fmt.Sprintf("%s/cpuacct/machine.slice/machine-qemu\x2d1\x2dinstance\x2d0000000%d.scope", cgroupsPath, i)

Expand Down
Loading