Skip to content

Commit 8e0bfc2

Browse files
committed
Add basic support for GPUs that can be partitioned
1 parent ab22d52 commit 8e0bfc2

File tree

4 files changed

+88
-4
lines changed

4 files changed

+88
-4
lines changed

cmd/k8s-device-plugin/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ func (p *Plugin) PreStartContainer(ctx context.Context, r *pluginapi.PreStartCon
128128
func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
129129
p.AMDGPUs = amdgpu.GetAMDGPUs()
130130

131+
glog.Infof("Found %d AMDGPUs", len(p.AMDGPUs))
132+
131133
devs := make([]*pluginapi.Device, len(p.AMDGPUs))
132134

133135
// limit scope for hwloc

internal/pkg/amdgpu/amdgpu.go

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,21 +95,52 @@ func GetAMDGPUs() map[string]map[string]int {
9595
matches, _ := filepath.Glob("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*")
9696

9797
devices := make(map[string]map[string]int)
98+
card, renderD := 0, 128
9899

99100
for _, path := range matches {
100101
glog.Info(path)
101102
devPaths, _ := filepath.Glob(path + "/drm/*")
102-
devices[filepath.Base(path)] = make(map[string]int)
103103

104104
for _, devPath := range devPaths {
105105
switch name := filepath.Base(devPath); {
106106
case name[0:4] == "card":
107-
devices[filepath.Base(path)][name[0:4]], _ = strconv.Atoi(name[4:])
107+
card, _ = strconv.Atoi(name[4:])
108108
case name[0:7] == "renderD":
109-
devices[filepath.Base(path)][name[0:7]], _ = strconv.Atoi(name[7:])
109+
renderD, _ = strconv.Atoi(name[7:])
110110
}
111111
}
112+
113+
devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
114+
}
115+
116+
// certain products have additional devices (such as MI300's partitions)
117+
//ex: /sys/devices/platform/amdgpu_xcp_30
118+
platformMatches, _ := filepath.Glob("/sys/devices/platform/amdgpu_xcp_*")
119+
120+
// This is needed because some of the visible renderD are actually not valid
121+
// Their validity depends on topology information from KFD
122+
topoRenderNodes := renderNodeSetFromTopology()
123+
124+
for _, path := range platformMatches {
125+
glog.Info(path)
126+
devPaths, _ := filepath.Glob(path + "/drm/*")
127+
128+
for _, devPath := range devPaths {
129+
switch name := filepath.Base(devPath); {
130+
case name[0:4] == "card":
131+
card, _ = strconv.Atoi(name[4:])
132+
case name[0:7] == "renderD":
133+
renderD, _ = strconv.Atoi(name[7:])
134+
}
135+
}
136+
137+
if !topoRenderNodes[renderD] {
138+
continue
139+
}
140+
141+
devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
112142
}
143+
113144
return devices
114145
}
115146

@@ -274,3 +305,38 @@ func parseDebugFSFirmwareInfo(path string) (map[string]uint32, map[string]uint32
274305

275306
return feat, fw
276307
}
308+
309+
var topoDrmRenderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`)
310+
311+
func renderNodeSetFromTopology(topoRootParam ...string) map[int]bool {
312+
topoRoot := "/sys/class/kfd/kfd"
313+
if len(topoRootParam) == 1 {
314+
topoRoot = topoRootParam[0]
315+
}
316+
317+
renderNodes := make(map[int]bool)
318+
var nodeFiles []string
319+
var err error
320+
321+
if nodeFiles, err = filepath.Glob(topoRoot + "/topology/nodes/*/properties"); err != nil {
322+
glog.Fatalf("glob error: %s", err)
323+
return renderNodes
324+
}
325+
326+
for _, nodeFile := range nodeFiles {
327+
glog.Info("Parsing " + nodeFile)
328+
v, e := ParseTopologyProperties(nodeFile, topoDrmRenderMinorRe)
329+
if e != nil {
330+
glog.Error(e)
331+
continue
332+
}
333+
334+
if v <= 0 {
335+
continue
336+
}
337+
338+
renderNodes[int(v)] = true
339+
}
340+
341+
return renderNodes
342+
}

internal/pkg/amdgpu/amdgpu_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
package amdgpu
1818

1919
import (
20+
"encoding/json"
2021
"fmt"
2122
"io/ioutil"
2223
"path/filepath"
24+
"reflect"
2325
"regexp"
2426
"strings"
2527
"testing"
@@ -213,3 +215,17 @@ func TestParseDebugFSFirmwareInfo(t *testing.T) {
213215
t.Errorf("Incorrect parsing of amdgpu firmware info from debugfs")
214216
}
215217
}
218+
219+
func TestRenderNodeSetFromTopology(t *testing.T) {
220+
renderNodes := renderNodeSetFromTopology("../../../testdata/topology-parsing-mi308")
221+
222+
expNodes := map[int]bool{128: true, 129: true}
223+
if !reflect.DeepEqual(renderNodes, expNodes) {
224+
val, _ := json.MarshalIndent(renderNodes, "", " ")
225+
exp, _ := json.MarshalIndent(expNodes, "", " ")
226+
227+
t.Errorf("RenderNode set was incorrect")
228+
t.Errorf("Got: %s", val)
229+
t.Errorf("Want: %s", exp)
230+
}
231+
}

testdata/topology-parsing/topology/nodes/2/properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ max_slots_scratch_cu 32
1717
vendor_id 4098
1818
device_id 26720
1919
location_id 6400
20-
drm_render_minor 128
20+
drm_render_minor 129
2121
max_engine_clk_fcompute 1500
2222
local_mem_size 17163091968
2323
fw_version 392

0 commit comments

Comments
 (0)