@@ -95,21 +95,52 @@ func GetAMDGPUs() map[string]map[string]int {
9595 matches , _ := filepath .Glob ("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*" )
9696
9797 devices := make (map [string ]map [string ]int )
98+ card , renderD := 0 , 128
9899
99100 for _ , path := range matches {
100101 glog .Info (path )
101102 devPaths , _ := filepath .Glob (path + "/drm/*" )
102- devices [filepath .Base (path )] = make (map [string ]int )
103103
104104 for _ , devPath := range devPaths {
105105 switch name := filepath .Base (devPath ); {
106106 case name [0 :4 ] == "card" :
107- devices [ filepath . Base ( path )][ name [ 0 : 4 ]] , _ = strconv .Atoi (name [4 :])
107+ card , _ = strconv .Atoi (name [4 :])
108108 case name [0 :7 ] == "renderD" :
109- devices [ filepath . Base ( path )][ name [ 0 : 7 ]] , _ = strconv .Atoi (name [7 :])
109+ renderD , _ = strconv .Atoi (name [7 :])
110110 }
111111 }
112+
113+ devices [filepath .Base (path )] = map [string ]int {"card" : card , "renderD" : renderD }
114+ }
115+
116+ // certain products have additional devices (such as MI300's partitions)
117+ //ex: /sys/devices/platform/amdgpu_xcp_30
118+ platformMatches , _ := filepath .Glob ("/sys/devices/platform/amdgpu_xcp_*" )
119+
120+ // This is needed because some of the visible renderD are actually not valid
121+ // Their validity depends on topology information from KFD
122+ topoRenderNodes := renderNodeSetFromTopology ()
123+
124+ for _ , path := range platformMatches {
125+ glog .Info (path )
126+ devPaths , _ := filepath .Glob (path + "/drm/*" )
127+
128+ for _ , devPath := range devPaths {
129+ switch name := filepath .Base (devPath ); {
130+ case name [0 :4 ] == "card" :
131+ card , _ = strconv .Atoi (name [4 :])
132+ case name [0 :7 ] == "renderD" :
133+ renderD , _ = strconv .Atoi (name [7 :])
134+ }
135+ }
136+
137+ if ! topoRenderNodes [renderD ] {
138+ continue
139+ }
140+
141+ devices [filepath .Base (path )] = map [string ]int {"card" : card , "renderD" : renderD }
112142 }
143+
113144 return devices
114145}
115146
@@ -274,3 +305,38 @@ func parseDebugFSFirmwareInfo(path string) (map[string]uint32, map[string]uint32
274305
275306 return feat , fw
276307}
308+
309+ var topoDrmRenderMinorRe = regexp .MustCompile (`drm_render_minor\s(\d+)` )
310+
311+ func renderNodeSetFromTopology (topoRootParam ... string ) map [int ]bool {
312+ topoRoot := "/sys/class/kfd/kfd"
313+ if len (topoRootParam ) == 1 {
314+ topoRoot = topoRootParam [0 ]
315+ }
316+
317+ renderNodes := make (map [int ]bool )
318+ var nodeFiles []string
319+ var err error
320+
321+ if nodeFiles , err = filepath .Glob (topoRoot + "/topology/nodes/*/properties" ); err != nil {
322+ glog .Fatalf ("glob error: %s" , err )
323+ return renderNodes
324+ }
325+
326+ for _ , nodeFile := range nodeFiles {
327+ glog .Info ("Parsing " + nodeFile )
328+ v , e := ParseTopologyProperties (nodeFile , topoDrmRenderMinorRe )
329+ if e != nil {
330+ glog .Error (e )
331+ continue
332+ }
333+
334+ if v <= 0 {
335+ continue
336+ }
337+
338+ renderNodes [int (v )] = true
339+ }
340+
341+ return renderNodes
342+ }
0 commit comments