2 changes: 2 additions & 0 deletions Makefile
@@ -166,6 +166,7 @@ test-e2e: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/t
./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics
./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt
./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt
./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt-nonsystemd-layout
./scripts/e2e-test.sh -s exporter-cgroups-v1-k8s
./scripts/e2e-test.sh -s exporter-cgroups-v2-k8s
./scripts/e2e-test.sh -s exporter-cgroups-v2-k8s-nogpu
@@ -246,6 +247,7 @@ test-e2e-update: build pkg/collector/testdata/sys/.unpacked pkg/collector/testda
./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt-nonsystemd-layout -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v1-k8s -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-k8s -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-k8s-nogpu -u || true
3 changes: 3 additions & 0 deletions pkg/collector/cgroup.go
@@ -350,7 +350,10 @@ func NewCgroupManager(name manager, logger *slog.Logger) (*cgroupManager, error)
for _, slice := range []string{"machine", "machine.slice"} {
if _, err := os.Stat(filepath.Join(slicesPrefix, slice)); err == nil {
manager.slices = append(manager.slices, slice)

if slice == "machine" {
logger.Debug("Non systemd cgroup layout detected")

manager.nonSystemdLayout = true
}

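For context, the cgroup.go change above only records which layout is in use: when a bare machine directory (instead of systemd's machine.slice) exists under the slices prefix, the manager is flagged as running on a non-systemd cgroup layout. A minimal standalone sketch of that detection follows; the simplified cgroupManager struct and the detectSlices helper are illustrative stand-ins for the repo's actual types, and /sys/fs/cgroup is an assumed prefix.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// cgroupManager is a pared-down stand-in for the exporter's manager type.
type cgroupManager struct {
	slices           []string
	nonSystemdLayout bool
}

// detectSlices checks which machine slice directories exist. Under systemd,
// libvirt guests live in machine.slice; without systemd the hierarchy uses a
// plain "machine" directory, which is treated as the non-systemd layout.
func detectSlices(slicesPrefix string) cgroupManager {
	var m cgroupManager

	for _, slice := range []string{"machine", "machine.slice"} {
		if _, err := os.Stat(filepath.Join(slicesPrefix, slice)); err == nil {
			m.slices = append(m.slices, slice)

			if slice == "machine" {
				m.nonSystemdLayout = true
			}
		}
	}

	return m
}

func main() {
	m := detectSlices("/sys/fs/cgroup")
	fmt.Printf("slices=%v nonSystemdLayout=%v\n", m.slices, m.nonSystemdLayout)
}
```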
32 changes: 29 additions & 3 deletions pkg/collector/slurm.go
@@ -635,16 +635,42 @@ func (c *slurmCollector) jobDevices(cgroups []cgroup) {

// jobCgroups returns cgroups of active jobs.
func (c *slurmCollector) jobCgroups() ([]cgroup, error) {
// Get active cgroups
// Get current cgroups
cgroups, err := c.cgroupManager.discover()
if err != nil {
return nil, fmt.Errorf("failed to discover cgroups: %w", err)
}

// Sometimes the SLURM daemon fails to clean up cgroups of
// terminated jobs. In that case the current cgroup slice will
// contain terminated jobs, which is not desirable. We filter the
// current cgroups by the number of procs inside each cgroup: a
// cgroup with no associated procs belongs to a terminated job.
var activeCgroups []cgroup

var staleCgroupIDs []string

for _, cgroup := range cgroups {
if len(cgroup.procs) > 0 {
activeCgroups = append(activeCgroups, cgroup)
} else {
staleCgroupIDs = append(staleCgroupIDs, cgroup.uuid)
}
}

// If stale cgroups are found, emit a warning log
if len(staleCgroupIDs) > 0 {
c.logger.Warn(
"Stale cgroups without any processes found", "ids", strings.Join(staleCgroupIDs, ","),
"num_cgroups", len(staleCgroupIDs),
)
}

// Update devices
c.jobDevices(cgroups)
c.jobDevices(activeCgroups)

return cgroups, nil
return activeCgroups, nil
}

// jobGRESResources returns GRES resources bound to current job.
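The filtering added to jobCgroups above can be read in isolation: cgroups that still have processes are kept, while cgroups with none are treated as leftovers of terminated jobs and only reported. Below is a hedged standalone sketch of that logic; the cgroup struct is simplified to just uuid and procs, and the sample job IDs are made up for illustration.

```go
package main

import (
	"fmt"
	"strings"
)

// cgroup is a simplified stand-in for the exporter's cgroup type, keeping
// only the fields the filtering logic needs.
type cgroup struct {
	uuid  string
	procs []int // PIDs attached to the cgroup
}

// filterStale splits discovered cgroups into active ones (at least one proc)
// and the IDs of stale ones left behind when job cleanup did not run.
func filterStale(cgroups []cgroup) (active []cgroup, staleIDs []string) {
	for _, cg := range cgroups {
		if len(cg.procs) > 0 {
			active = append(active, cg)
		} else {
			staleIDs = append(staleIDs, cg.uuid)
		}
	}

	return active, staleIDs
}

func main() {
	// Hypothetical job IDs purely for illustration.
	cgroups := []cgroup{
		{uuid: "1009248", procs: []int{4321}},
		{uuid: "1009249"}, // no procs: terminated job whose cgroup lingered
	}

	active, stale := filterStale(cgroups)
	fmt.Printf("active=%d stale=%s\n", len(active), strings.Join(stale, ","))
}
```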
16 changes: 10 additions & 6 deletions pkg/collector/slurm_test.go
@@ -550,6 +550,11 @@ func TestJobDevicesCaching(t *testing.T) {
0o600,
)
require.NoError(t, err)

procDir := fmt.Sprintf("%s/%d", procFS, i)

err = os.MkdirAll(procDir, 0o750)
require.NoError(t, err)
}

// Fake jobs
@@ -567,9 +572,6 @@
for ijob, gres := range mockJobs {
dir := fmt.Sprintf("%s/%d", procFS, ijob)

err = os.MkdirAll(dir, 0o750)
require.NoError(t, err)

envs := []string{fmt.Sprintf("SLURM_JOB_ID=%d", ijob), "SLURM_JOB_GPUS=" + strings.Join(gres.deviceIDs, ",")}

if gres.numShares > 0 {
@@ -632,6 +634,11 @@
0o600,
)
require.NoError(t, err)

procDir := fmt.Sprintf("%s/%d", procFS, i)

err = os.MkdirAll(procDir, 0o750)
require.NoError(t, err)
}

// Binds GPUs to first jobs 19 to 25
@@ -640,9 +647,6 @@

dir := fmt.Sprintf("%s/%d", procFS, jobid)

err = os.MkdirAll(dir, 0o750)
require.NoError(t, err)

envs := []string{fmt.Sprintf("SLURM_JOB_ID=%d", jobid), "SLURM_JOB_GPUS=" + strings.Join(gres.deviceIDs, ",")}

if gres.numShares > 0 {