Skip to content

Commit 7c6f48d

Browse files
authored
Add support for OFED (#254)
* add ofed role
* fix ofed dependencies install
* use mlnxofedinstall as recommended for Rocky8/9 now
* add build of OFED image to CI
* add build of OFED image to CI
* fix ofed commands
* default to OFED hpc package selection
* fix OFED packages concatenation on RL9
* autobuild on ofed branch
* always build RL8 and RL9 images
* fix ofed_package_selection templating
* fix ofed_build_packages
* avoid OFED install timeouts
* add additional packages for RL8
* bump leafcloud build size for memory issues
* fix missing packages for RL9 build
* remove duplication in packer definition and allow for different OFED image size
* add leafcloud OFED disk size
* workaround OFED/turbovnc install clash
* output multiple image names
* bump CI to RL8 and RL9 OFED-enabled images
* bump CI images (non-OFED for RL8)
1 parent c258b1f commit 7c6f48d

File tree

11 files changed

+216
-36
lines changed

11 files changed

+216
-36
lines changed

.github/workflows/fatimage.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,19 @@ jobs:
6161
. environments/.stackhpc/activate
6262
cd packer/
6363
packer init .
64-
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
64+
PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
6565
env:
6666
PKR_VAR_os_version: ${{ matrix.os_version }}
6767

68-
- name: Get created image name from manifest
68+
- name: Get created image names from manifest
6969
id: manifest
7070
run: |
7171
. venv/bin/activate
72-
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
73-
while ! openstack image show -f value -c name $IMAGE_ID; do
74-
sleep 30
72+
for IMAGE_ID in $(jq --raw-output '.builds[].artifact_id' packer/packer-manifest.json)
73+
do
74+
while ! openstack image show -f value -c name $IMAGE_ID; do
75+
sleep 5
76+
done
77+
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
78+
echo $IMAGE_NAME
7579
done
76-
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
77-
echo "IMAGE_ID=${IMAGE_ID}" >> "$GITHUB_OUTPUT"
78-
echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"

ansible/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ roles/*
5252
!roles/image_build/**
5353
!roles/persist_hostkeys/
5454
!roles/persist_hostkeys/**
55-
!roles/requirements.yml
55+
!roles/ofed/
56+
!roles/ofed/**

ansible/bootstrap.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,3 +196,11 @@
196196
- name: update facts
197197
setup:
198198
when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false))
199+
200+
- hosts: ofed
201+
gather_facts: no
202+
become: yes
203+
tags: ofed
204+
tasks:
205+
- include_role:
206+
name: ofed

ansible/roles/ofed/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# ofed
2+
3+
This role installs Mellanox OFED:
4+
- It checks that the running kernel is the latest installed one, and errors if not.
5+
- Installation uses the `mlnxofedinstall` command, with support for the running kernel
6+
and (by default) without firmware updates.
7+
8+
As OFED installation takes a long time, this role should generally only be used during image build,
9+
for example by setting:
10+
11+
```
12+
environments/groups/<environment>/groups:
13+
[ofed:children]
14+
builder
15+
```
16+
17+
# Role variables
18+
19+
See `defaults/main.yml`
20+
21+
Note: Ansible facts are required unless `ofed_distro_version` and `ofed_arch` are set explicitly.

ansible/roles/ofed/defaults/main.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
ofed_version: 24.01-0.3.3.1
2+
ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
3+
ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
4+
ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
5+
ofed_arch: "{{ ansible_architecture }}"
6+
ofed_tmp_dir: /tmp
7+
ofed_update_firmware: false
8+
ofed_build_packages: # may require additional packages depending on ofed_package_selection
9+
- autoconf
10+
- automake
11+
- gcc
12+
- gcc-gfortran
13+
- kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}
14+
- kernel-rpm-macros
15+
- libtool
16+
- lsof
17+
- patch
18+
- pciutils
19+
- perl
20+
- rpm-build
21+
- tcl
22+
- tk
23+
ofed_build_rl8_packages:
24+
- gdb-headless
25+
- python36
26+
ofed_package_selection: # list of package selection flags for mlnxofedinstall script
27+
- hpc
28+
- with-nfsrdma

ansible/roles/ofed/tasks/install.yml

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
- name: Get installed kernels
2+
command: dnf list --installed kernel
3+
register: _ofed_dnf_kernels
4+
changed_when: false
5+
6+
- name: Determine running kernel
7+
command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64
8+
register: _ofed_loaded_kernel
9+
changed_when: false
10+
11+
- name: Check current kernel is newest installed
12+
assert:
13+
that: _ofed_loaded_kernel.stdout == _ofed_dnf_kernels_newest
14+
fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
15+
vars:
16+
_ofed_dnf_kernels_newest: >-
17+
{{ _ofed_dnf_kernels.stdout_lines[1:] | map('regex_replace', '^\w+\.(\w+)\s+(\S+)\s+\S+\s*$', '\2.\1') | community.general.version_sort | last }}
18+
# dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "
19+
20+
- name: Enable epel
21+
dnf:
22+
name: epel-release
23+
24+
- name: Check for existing OFED installation
25+
command: ofed_info
26+
changed_when: false
27+
failed_when:
28+
- _ofed_info.rc > 0
29+
- "'No such file or directory' not in _ofed_info.msg"
30+
register: _ofed_info
31+
32+
- name: Install build prerequisites
33+
dnf:
34+
name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_version == '8.9' else []) }}"
35+
when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
36+
# don't want to install a load of prereqs unnecessarily
37+
38+
- name: Download and unpack Mellanox OFED tarball
39+
ansible.builtin.unarchive:
40+
src: "{{ ofed_download_url }}"
41+
dest: "{{ ofed_tmp_dir }}"
42+
remote_src: yes
43+
become: no
44+
when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
45+
46+
# Below from https://docs.nvidia.com/networking/display/mlnxofedv24010331/user+manual
47+
- name: Run OFED install script
48+
command:
49+
cmd: >
50+
./mlnxofedinstall
51+
--add-kernel-support
52+
{% if not ofed_update_firmware %}--without-fw-update{% endif %}
53+
--force
54+
--skip-repo
55+
{% for pkgsel in ofed_package_selection %}
56+
--{{ pkgsel }}
57+
{% endfor %}
58+
chdir: "{{ ofed_tmp_dir }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}/"
59+
register: _ofed_install
60+
when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
61+
async: "{{ 45 * 60 }}" # wait for up to 45 minutes
62+
poll: 15 # check every 15 seconds
63+
64+
- name: Update initramfs
65+
command:
66+
cmd: dracut -f
67+
when: '"update your initramfs" in _ofed_install.stdout | default("")'
68+
failed_when: false # always shows errors due to deleted modules for inbox RDMA drivers
69+
70+
- name: Load the new driver
71+
command:
72+
cmd: /etc/init.d/openibd restart
73+
when: '"To load the new driver" in _ofed_install.stdout | default("")'

ansible/roles/ofed/tasks/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- include_tasks: install.yml

ansible/roles/openondemand/tasks/vnc_compute.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,22 @@
1010
yum:
1111
name: epel-release
1212

13+
- name: Check /etc/init.d
14+
ansible.builtin.stat:
15+
path: /etc/init.d
16+
register: init_d
17+
18+
- name: Move OFED-installed init scripts
19+
# turbovnc installs chkconfig, which makes /etc/init.d a symlink to /etc/rc.d/init.d,
20+
# but OFED has already created /etc/init.d as a real directory and installed files in it.
21+
# See https://access.redhat.com/solutions/6969215
22+
ansible.builtin.command:
23+
cmd: mv /etc/init.d /etc/init.d.orig
24+
creates: /etc/init.d.orig
25+
when:
26+
- init_d.stat.exists
27+
- not init_d.stat.islnk
28+
1329
- name: Install VNC-related packages
1430
tags: install
1531
dnf:
@@ -19,6 +35,15 @@
1935
- python3.9
2036
- dbus-x11
2137

38+
- name: Replace OFED-installed init scripts
39+
ansible.builtin.copy:
40+
src: /etc/init.d.orig/ # trailing / to get contents
41+
dest: /etc/init.d
42+
remote_src: true
43+
when:
44+
- init_d.stat.exists
45+
- not init_d.stat.islnk
46+
2247
- name: Install Xfce desktop
2348
tags: install
2449
yum:

environments/.stackhpc/LEAFCLOUD.pkrvars.hcl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
flavor = "en1.xsmall"
1+
flavor = "ec1.medium"
22
use_blockstorage_volume = true
33
volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny
4+
volume_size_ofed = 15 # GB
45
volume_type = "unencrypted"
56
image_disk_format = "qcow2"
67
networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci

environments/.stackhpc/terraform/main.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ variable "cluster_image" {
2828
description = "single image for all cluster nodes, keyed by os_version - a convenience for CI"
2929
type = map(string)
3030
default = {
31-
# https://github.com/stackhpc/ansible-slurm-appliance/pull/382
32-
RL8: "openhpc-RL8-240327-1050-4812f852"
33-
RL9: "openhpc-RL9-240327-1026-4812f852"
31+
# https://github.com/stackhpc/ansible-slurm-appliance/pull/353
32+
RL8: "openhpc-RL8-240423-1002-4b09ba85"
33+
RL9: "openhpc-ofed-RL9-240423-1059-4b09ba85"
3434
}
3535
}
3636

0 commit comments

Comments
 (0)