
Commit 7509986

Add support for configuring Multi-Instance GPUs (MIG) (#656)
* Adds support for NVIDIA MIG configuration
* Bump openhpc to fix gres validation
* Bump disk size for build
* Bump volume size for extra build
* Revert bump to volume size
1 parent eabf59b commit 7509986
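
This wires the stackhpc.linux.vgpu role into the appliance so NVIDIA GPUs can be partitioned into MIG instances. As a rough sketch of the kind of partitioning the role automates (standard nvidia-smi MIG commands, not taken from this commit; the GPU index and the 1g.10gb profile are placeholders that depend on the GPU model):

    # Enable MIG mode on GPU 0 (needs a GPU reset or node reboot to take effect)
    nvidia-smi -i 0 -mig 1
    # Create two GPU instances from the 1g.10gb profile, each with its default compute instance
    nvidia-smi mig -i 0 -cgi 1g.10gb,1g.10gb -C
    # List the resulting MIG devices
    nvidia-smi -L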

File tree

15 files changed, +362 -3 lines changed

.github/workflows/extra.yml

Lines changed: 2 additions & 2 deletions
@@ -32,11 +32,11 @@ jobs:
           - image_name: openhpc-extra-RL8
             source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
             inventory_groups: doca,cuda,lustre
-            volume_size: 30 # needed for cuda
+            volume_size: 35 # needed for cuda
           - image_name: openhpc-extra-RL9
             source_image_name_key: RL9
             inventory_groups: doca,cuda,lustre
-            volume_size: 30 # needed for cuda
+            volume_size: 35 # needed for cuda
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -90,5 +90,7 @@ roles/*
 !roles/gateway/**
 !roles/alertmanager/
 !roles/alertmanager/**
+!roles/slurm_recompile/
+!roles/slurm_recompile/**
 !roles/nhc/
 !roles/nhc/**

ansible/extras.yml

Lines changed: 14 additions & 0 deletions
@@ -48,6 +48,20 @@
         name: cuda
         tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"

+- name: Setup vGPU
+  hosts: vgpu
+  become: yes
+  gather_facts: yes
+  tags: vgpu
+  tasks:
+    - include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
+  handlers:
+    - name: reboot
+      fail:
+        msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
+
 - name: Persist hostkeys across rebuilds
   # Must be after filesystems.yml (for storage)
   # and before portal.yml (where OOD login node hostkeys are scanned)
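
The new play targets only hosts in the vgpu inventory group and carries the vgpu tag, so it can be re-run on its own once GPU nodes are added to that group. An illustrative invocation (the --tags/--limit options are a suggestion, not part of the commit):

    ansible-playbook ansible/extras.yml --tags vgpu --limit vgpu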

ansible/fatimage.yml

Lines changed: 10 additions & 0 deletions
@@ -250,6 +250,16 @@
         name: cloudalchemy.grafana
         tasks_from: install.yml

+- name: Add support for NVIDIA GPU auto detection to Slurm
+  hosts: cuda
+  become: yes
+  tasks:
+    - name: Recompile slurm
+      import_role:
+        name: slurm_recompile
+      vars:
+        slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}"
+
 - name: Run post.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
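
Recompiling Slurm against NVML is what enables Slurm's AutoDetect=nvml mechanism in gres.conf to enumerate GPUs and MIG instances automatically; the gres.conf side is handled outside this diff (the commit message mentions bumping openhpc for gres validation). A hedged check on a node built from such an image:

    # Print the GRES devices slurmd detects (NVML auto-detection lists GPU/MIG entries)
    slurmd -G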

ansible/roles/compute_init/README.md

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
 | extras.yml | basic_users | All functionality [6] | No |
 | extras.yml | eessi | All functionality [7] | No |
 | extras.yml | cuda | None required - use image build | Yes [8] |
+| extras.yml | vgpu | All functionality | Yes |
 | extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
 | extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
 | extras.yml | k9s (install) | Not relevant during boot | n/a |

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 7 additions & 0 deletions
@@ -19,6 +19,7 @@
     enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
     enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
     enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
+    enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"
     enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"

     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
@@ -296,6 +297,12 @@
           cmd: "cvmfs_config setup"
         when: enable_eessi

+    - name: Configure VGPUs
+      include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: 'configure.yml'
+      when: enable_vgpu
+
     # NB: don't need conditional block on enable_compute as have already exited
     # if not the case
     - name: Write Munge key
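
At boot, compute-init reads these enable_* flags from OpenStack instance metadata. A hedged example of switching on the vGPU configuration for an existing compute node via the OpenStack CLI (in practice the metadata would normally be set by the cluster's infrastructure templates rather than by hand):

    openstack server set --property vgpu=true <compute-node>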

ansible/roles/cuda/tasks/facts.yml

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+---
+- name: Set cuda_facts_version_short
+  set_fact:
+    cuda_facts_version_short: "{{ cuda_version_short }}"
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# slurm_recompile
+=================
+
+Recompiles slurm from source RPMs and installs the packages that were built.
+
+Requirements
+------------
+
+Role Variables
+--------------
+
+See `defaults/main.yml`.
+
+Dependencies
+------------
+
+Example Playbook
+----------------
+
+    - hosts: compute
+      tasks:
+        - import_role:
+            name: slurm_recompile
+
+License
+-------
+
+Apache-2.0
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+---
+# Whether to link slurm against the NVIDIA management library
+slurm_recompile_with_nvml: false
+
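
The false default keeps non-GPU rebuilds free of the NVML dependency; callers can override it per run, for example (illustrative inventory and playbook names, assuming a playbook that imports the role as in the README above):

    ansible-playbook -i inventory slurm_recompile.yml -e slurm_recompile_with_nvml=true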
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+---
+- name: Get facts about CUDA installation
+  import_role:
+    name: cuda
+    tasks_from: facts.yml
+
+- name: Gather the package facts
+  ansible.builtin.package_facts:
+    manager: auto
+
+- name: Set fact containing slurm package facts
+  set_fact:
+    slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"
+
+- name: Recompile and install slurm packages
+  shell: |
+    #!/bin/bash
+    source /etc/profile
+    set -eux
+    dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
+    rpm -i slurm-ohpc-*.src.rpm
+    cd /root/rpmbuild/SPECS
+    dnf builddep -y slurm.spec
+    rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
+    dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
+  become: true
+
+- name: Workaround missing symlink
+  # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
+  command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
+  args:
+    creates: /lib64/libnvidia-ml.so
+  when: slurm_recompile_with_nvml | bool
+
+- name: Cleanup Dependencies
+  shell: |
+    #!/bin/bash
+    set -eux
+    set -o pipefail
+    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
+  become: true
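
When slurm_recompile_with_nvml is set, the NVML support is linked into Slurm's gpu_nvml plugin, which is what the libnvidia-ml symlink workaround above serves. A quick hedged check (the plugin path assumes the OpenHPC package layout):

    # confirm the rebuilt GPU plugin resolves the NVIDIA management library
    ldd /usr/lib64/slurm/gpu_nvml.so | grep -i libnvidia-ml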
