Skip to content

Commit 91007d2

Browse files
authored
add fallback mechanism for allocator init failing and port upstream changes (ROCm#36)
* Port the latest partition changes from the ROCm repo. * Handle allocator init failure: introduce a fallback option in case the best-effort allocator fails to initialize; in that case we fall back to the default kubelet allocation.
1 parent ecb1dba commit 91007d2

File tree

4,224 files changed

+57685
-326
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

4,224 files changed

+57685
-326
lines changed

cmd/k8s-device-plugin/main.go

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,25 +32,43 @@ import (
3232

3333
var gitDescribe string
3434

35-
func getResourceList(resourceNamingStrategy string) ([]string, error) {
35+
// ResourceNamingStrategy selects how the device plugin names the resources
// it reports for the GPUs on a node.
type ResourceNamingStrategy string

// Resource naming strategies accepted by ParseStrategy.
const (
	// StrategySingle reports a homogeneous node's GPUs under the single
	// "gpu" resource name.
	StrategySingle ResourceNamingStrategy = "single"
	// StrategyMixed reports resources under their partition type names.
	StrategyMixed ResourceNamingStrategy = "mixed"
)

// ParseStrategy converts s into a ResourceNamingStrategy.
//
// Only the exact strings "single" and "mixed" are accepted; matching is
// case-sensitive and no whitespace trimming is performed. For any other
// input it returns the empty strategy and a non-nil error.
func ParseStrategy(s string) (ResourceNamingStrategy, error) {
	switch strategy := ResourceNamingStrategy(s); strategy {
	case StrategySingle, StrategyMixed:
		return strategy, nil
	default:
		// %q keeps empty or whitespace-only input visible in the message,
		// and the accepted values are listed so the user can correct the flag.
		return "", fmt.Errorf("invalid resource naming strategy: %q (expected %q or %q)", s, StrategySingle, StrategyMixed)
	}
}
52+
53+
func getResourceList(resourceNamingStrategy ResourceNamingStrategy) ([]string, error) {
3654
var resources []string
3755

3856
// Check if the node is homogeneous
3957
isHomogeneous := amdgpu.IsHomogeneous()
40-
devices, deviceCountMap := amdgpu.GetAMDGPUs()
41-
if len(devices) == 0 {
58+
partitionCountMap := amdgpu.UniquePartitionConfigCount(amdgpu.GetAMDGPUs())
59+
if len(amdgpu.GetAMDGPUs()) == 0 {
4260
return resources, nil
4361
}
4462
if isHomogeneous {
4563
// Homogeneous node will report only "gpu" resource if strategy is single. If strategy is mixed, it will report resources under the partition type name
46-
if resourceNamingStrategy == "single" {
64+
if resourceNamingStrategy == StrategySingle {
4765
resources = []string{"gpu"}
48-
} else if resourceNamingStrategy == "mixed" {
49-
if len(deviceCountMap) == 0 {
66+
} else if resourceNamingStrategy == StrategyMixed {
67+
if len(partitionCountMap) == 0 {
5068
// If partitioning is not supported on the node, we should report resources under "gpu" regardless of the strategy
5169
resources = []string{"gpu"}
5270
} else {
53-
for partitionType, count := range deviceCountMap {
71+
for partitionType, count := range partitionCountMap {
5472
if count > 0 {
5573
resources = append(resources, partitionType)
5674
}
@@ -59,10 +77,10 @@ func getResourceList(resourceNamingStrategy string) ([]string, error) {
5977
}
6078
} else {
6179
// Heterogeneous node reports resources based on partition types if strategy is mixed. Heterogeneous is not allowed if Strategy is single
62-
if resourceNamingStrategy == "single" {
80+
if resourceNamingStrategy == StrategySingle {
6381
return resources, fmt.Errorf("Partitions of different styles across GPUs in a node is not supported with single strategy. Please start device plugin with mixed strategy")
64-
} else if resourceNamingStrategy == "mixed" {
65-
for partitionType, count := range deviceCountMap {
82+
} else if resourceNamingStrategy == StrategyMixed {
83+
for partitionType, count := range partitionCountMap {
6684
if count > 0 {
6785
resources = append(resources, partitionType)
6886
}
@@ -92,6 +110,11 @@ func main() {
92110
flag.StringVar(&resourceNamingStrategy, "resource_naming_strategy", "single", "Resource strategy to be used: single or mixed")
93111
// this is also needed to enable glog usage in dpm
94112
flag.Parse()
113+
strategy, err := ParseStrategy(resourceNamingStrategy)
114+
if err != nil {
115+
glog.Errorf("%v", err)
116+
os.Exit(1)
117+
}
95118

96119
for _, v := range versions {
97120
glog.Infof("%s", v)
@@ -117,7 +140,7 @@ func main() {
117140
// /sys/class/kfd only exists if ROCm kernel/driver is installed
118141
var path = "/sys/class/kfd"
119142
if _, err := os.Stat(path); err == nil {
120-
resources, err := getResourceList(resourceNamingStrategy)
143+
resources, err := getResourceList(strategy)
121144
if err != nil {
122145
glog.Errorf("Error occured: %v", err)
123146
os.Exit(1)

cmd/k8s-node-labeller/main.go

Lines changed: 24 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -353,46 +353,36 @@ var labelGenerators = map[string]func(map[string]map[string]interface{}) map[str
353353

354354
return createLabels("cu-count", counts)
355355
},
356-
}
357-
358-
var labelProperties = make(map[string]*bool, len(labelGenerators))
359-
360-
func generatePartitionLabels() map[string]string {
361-
_, deviceCountMap := amdgpu.GetAMDGPUs()
362-
isHomogeneous := amdgpu.IsHomogeneous()
363-
IsComputePartitionSupported := amdgpu.IsComputePartitionSupported()
364-
IsMemoryPartitionSupported := amdgpu.IsMemoryPartitionSupported()
365-
366-
labels := make(map[string]string)
367-
368-
if isHomogeneous {
369-
// Iterate through deviceCountMap and find the partition type with count > 0
370-
for partitionType, count := range deviceCountMap {
371-
if count > 0 {
372-
labels[partitionTypeLabel] = partitionType
373-
break
356+
"compute-memory-partition": func(gpus map[string]map[string]interface{}) map[string]string {
357+
partitionCountMap := amdgpu.UniquePartitionConfigCount(gpus)
358+
isHomogeneous := amdgpu.IsHomogeneous()
359+
if isHomogeneous {
360+
for partitionType, count := range partitionCountMap {
361+
if count > 0 {
362+
pfx := createLabelPrefix("compute-memory-partition", false)
363+
return map[string]string{pfx: partitionType}
364+
}
374365
}
375366
}
376-
}
377-
378-
if IsComputePartitionSupported {
379-
labels[computePartitioningSupportedLabel] = "true"
380-
} else {
381-
labels[computePartitioningSupportedLabel] = "false"
382-
}
383-
384-
if IsMemoryPartitionSupported {
385-
labels[memoryPartitioningSupportedLabel] = "true"
386-
} else {
387-
labels[memoryPartitioningSupportedLabel] = "false"
388-
}
389-
390-
return labels
367+
return map[string]string{}
368+
},
369+
"compute-partitioning-supported": func(gpus map[string]map[string]interface{}) map[string]string {
370+
val := strconv.FormatBool(amdgpu.IsComputePartitionSupported())
371+
pfx := createLabelPrefix("compute-partitioning-supported", false)
372+
return map[string]string{pfx: val}
373+
},
374+
"memory-partitioning-supported": func(gpus map[string]map[string]interface{}) map[string]string {
375+
val := strconv.FormatBool(amdgpu.IsMemoryPartitionSupported())
376+
pfx := createLabelPrefix("memory-partitioning-supported", false)
377+
return map[string]string{pfx: val}
378+
},
391379
}
392380

381+
var labelProperties = make(map[string]*bool, len(labelGenerators))
382+
393383
func generateLabels(lblProps map[string]*bool) map[string]string {
394384
results := make(map[string]string, len(labelGenerators))
395-
gpus, _ := amdgpu.GetAMDGPUs()
385+
gpus := amdgpu.GetAMDGPUs()
396386

397387
for l, f := range labelGenerators {
398388
if !*lblProps[l] {
@@ -403,13 +393,6 @@ func generateLabels(lblProps map[string]*bool) map[string]string {
403393
results[k] = v
404394
}
405395
}
406-
407-
// Add the new GPU labels
408-
gpuLabels := generatePartitionLabels()
409-
for k, v := range gpuLabels {
410-
results[k] = v
411-
}
412-
413396
return results
414397
}
415398

cmd/k8s-node-labeller/main_test.go

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,29 +10,32 @@ import (
1010

1111
var (
1212
expectedAllLabelKeys = map[string]bool{
13-
"amd.com/gpu.family": true,
14-
"amd.com/gpu.driver-version": true,
15-
"amd.com/gpu.driver-src-version": true,
16-
"amd.com/gpu.firmware": true,
17-
"amd.com/gpu.device-id": true,
18-
"amd.com/gpu.product-name": true,
19-
"amd.com/gpu.vram": true,
20-
"amd.com/gpu.simd-count": true,
21-
"amd.com/gpu.cu-count": true,
22-
"amd.com/compute-partitioning-supported": true,
23-
"amd.com/memory-partitioning-supported": true,
24-
"amd.com/compute-memory-partition": true,
13+
"amd.com/gpu.family": true,
14+
"amd.com/gpu.driver-version": true,
15+
"amd.com/gpu.driver-src-version": true,
16+
"amd.com/gpu.firmware": true,
17+
"amd.com/gpu.device-id": true,
18+
"amd.com/gpu.product-name": true,
19+
"amd.com/gpu.vram": true,
20+
"amd.com/gpu.simd-count": true,
21+
"amd.com/gpu.cu-count": true,
22+
"amd.com/gpu.compute-memory-partition": true,
23+
"amd.com/gpu.compute-partitioning-supported": true,
24+
"amd.com/gpu.memory-partitioning-supported": true,
2525
}
2626
expectedAllExperimentalLabelKeys = map[string]bool{
27-
"beta.amd.com/gpu.family": true,
28-
"beta.amd.com/gpu.driver-version": true,
29-
"beta.amd.com/gpu.driver-src-version": true,
30-
"beta.amd.com/gpu.firmware": true,
31-
"beta.amd.com/gpu.device-id": true,
32-
"beta.amd.com/gpu.product-name": true,
33-
"beta.amd.com/gpu.vram": true,
34-
"beta.amd.com/gpu.simd-count": true,
35-
"beta.amd.com/gpu.cu-count": true,
27+
"beta.amd.com/gpu.family": true,
28+
"beta.amd.com/gpu.driver-version": true,
29+
"beta.amd.com/gpu.driver-src-version": true,
30+
"beta.amd.com/gpu.firmware": true,
31+
"beta.amd.com/gpu.device-id": true,
32+
"beta.amd.com/gpu.product-name": true,
33+
"beta.amd.com/gpu.vram": true,
34+
"beta.amd.com/gpu.simd-count": true,
35+
"beta.amd.com/gpu.cu-count": true,
36+
"beta.amd.com/gpu.compute-memory-partition": true,
37+
"beta.amd.com/gpu.compute-partitioning-supported": true,
38+
"beta.amd.com/gpu.memory-partitioning-supported": true,
3639
}
3740
)
3841

@@ -91,6 +94,26 @@ func TestRemoveOldNodeLabels(t *testing.T) {
9194
"dummyLabel2": "2",
9295
},
9396
},
97+
{
98+
inputNode: &corev1.Node{
99+
ObjectMeta: metav1.ObjectMeta{
100+
Labels: map[string]string{
101+
"amd.com/cpu": "true",
102+
"amd.com/gpu": "true",
103+
"amd.com/mi300x": "true",
104+
"dummyLabel1": "1",
105+
"dummyLabel2": "2",
106+
},
107+
},
108+
},
109+
expectLabels: map[string]string{
110+
"amd.com/cpu": "true",
111+
"amd.com/gpu": "true",
112+
"amd.com/mi300x": "true",
113+
"dummyLabel1": "1",
114+
"dummyLabel2": "2",
115+
},
116+
},
94117
}
95118

96119
for _, tc := range testCases {

docs/conf.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,29 @@
44
# list see the documentation:
55
# https://www.sphinx-doc.org/en/master/usage/configuration.html
66

7-
# configurations for PDF output by Read the Docs
8-
project = "k8s-device-plugin Documentation"
9-
author = "Advanced Micro Devices, Inc."
10-
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
11-
version = "6.2.0"
12-
release = "6.2.0"
137

14-
external_toc_path = "./sphinx/_toc.yml"
8+
external_projects_local_file = "projects.yaml"
9+
external_projects_remote_repository = ""
10+
#external_projects = ["k8s-device-plugin"]
11+
external_projects = []
12+
external_projects_current_project = "k8s-device-plugin"
13+
14+
project = "AMD Kubernetes Device Plugin Documentation"
15+
version = "1.3.1"
16+
release = version
17+
html_title = f"Device Plugin Documentation {version}"
18+
author = "Advanced Micro Devices, Inc."
19+
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
1520

21+
# Required settings
22+
html_theme = "rocm_docs_theme"
23+
html_theme_options = {
24+
"flavor": "instinct"
25+
}
1626
extensions = ["rocm_docs"]
1727

18-
external_projects_current_project = "rocm"
28+
external_toc_path = "./sphinx/_toc.yml"
1929

20-
html_theme = "rocm_docs_theme"
21-
html_theme_options = {"flavor": "rocm-docs-home"}
30+
extensions = ["rocm_docs"]
2231

23-
html_title = project
32+
exclude_patterns = ['.venv']

docs/contributing/development.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Development Guidelines
2+
3+
## Getting Started
4+
5+
### Prerequisites
6+
7+
Before you begin development, ensure you have the following prerequisites:
8+
9+
- A working installation of Go (version 1.23 or later).
10+
- Access to a Kubernetes cluster for testing.
11+
- Familiarity with Git and GitHub workflows.
12+
13+
### Cloning the Repository
14+
15+
To contribute to the project, start by cloning the repository:
16+
17+
```bash
18+
git clone https://github.com/ROCm/k8s-device-plugin.git
19+
cd k8s-device-plugin
20+
```
21+
22+
### Branching Strategy
23+
24+
When working on a new feature or bug fix, create a new branch from the `main` branch:
25+
26+
```bash
27+
git checkout -b feature/my-new-feature
28+
```
29+
30+
Make sure to name your branch descriptively to reflect the changes you are making.
31+
32+
## Development Workflow
33+
34+
1. **Make Changes**: Implement your changes in the codebase.
35+
2. **Testing**: Ensure that your changes are covered by tests. Run existing tests and add new ones as necessary.
36+
3. **Commit Changes**: Commit your changes with a clear and concise commit message:
37+
38+
```bash
39+
git commit -m "Add feature X to improve Y"
40+
```
41+
42+
4. **Push Changes**: Push your branch to the remote repository:
43+
44+
```bash
45+
git push origin feature/my-new-feature
46+
```
47+
48+
5. **Create a Pull Request**: Navigate to the GitHub repository and create a pull request. Provide a detailed description of your changes and why they are necessary.
49+
50+
## Code Review Process
51+
52+
All contributions will undergo a code review process. Reviewers will assess the quality of the code, adherence to coding standards, and the completeness of tests. Be open to feedback and ready to make adjustments as needed.

0 commit comments

Comments
 (0)