Skip to content

Commit 01e57c2

Browse files
committed
migrated DRA driver to v1
1 parent fc2d0ba commit 01e57c2

File tree

16 files changed

+313
-251
lines changed

16 files changed

+313
-251
lines changed

README.md

Lines changed: 97 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ And show the initial state of available GPU devices on the worker node:
108108
$ kubectl get resourceslice -o yaml
109109
apiVersion: v1
110110
items:
111-
- apiVersion: resource.k8s.io/v1beta1
111+
- apiVersion: resource.k8s.io/v1
112112
kind: ResourceSlice
113113
metadata:
114114
creationTimestamp: "2024-12-09T16:17:09Z"
@@ -131,117 +131,109 @@ items:
131131
name: dra-example-driver-cluster-worker
132132
resourceSliceCount: 1
133133
devices:
134-
- basic:
135-
attributes:
136-
driverVersion:
137-
version: 1.0.0
138-
index:
139-
int: 0
140-
model:
141-
string: LATEST-GPU-MODEL
142-
uuid:
143-
string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
144-
capacity:
145-
memory:
146-
value: 80Gi
134+
- attributes:
135+
driverVersion:
136+
version: 1.0.0
137+
index:
138+
int: 0
139+
model:
140+
string: LATEST-GPU-MODEL
141+
uuid:
142+
string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
143+
capacity:
144+
memory:
145+
value: 80Gi
147146
name: gpu-0
148-
- basic:
149-
attributes:
150-
driverVersion:
151-
version: 1.0.0
152-
index:
153-
int: 1
154-
model:
155-
string: LATEST-GPU-MODEL
156-
uuid:
157-
string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
158-
capacity:
159-
memory:
160-
value: 80Gi
147+
- attributes:
148+
driverVersion:
149+
version: 1.0.0
150+
index:
151+
int: 1
152+
model:
153+
string: LATEST-GPU-MODEL
154+
uuid:
155+
string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
156+
capacity:
157+
memory:
158+
value: 80Gi
161159
name: gpu-1
162-
- basic:
163-
attributes:
164-
driverVersion:
165-
version: 1.0.0
166-
index:
167-
int: 2
168-
model:
169-
string: LATEST-GPU-MODEL
170-
uuid:
171-
string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
172-
capacity:
173-
memory:
174-
value: 80Gi
160+
- attributes:
161+
driverVersion:
162+
version: 1.0.0
163+
index:
164+
int: 2
165+
model:
166+
string: LATEST-GPU-MODEL
167+
uuid:
168+
string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
169+
capacity:
170+
memory:
171+
value: 80Gi
175172
name: gpu-2
176-
- basic:
177-
attributes:
178-
driverVersion:
179-
version: 1.0.0
180-
index:
181-
int: 3
182-
model:
183-
string: LATEST-GPU-MODEL
184-
uuid:
185-
string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
186-
capacity:
187-
memory:
188-
value: 80Gi
173+
- attributes:
174+
driverVersion:
175+
version: 1.0.0
176+
index:
177+
int: 3
178+
model:
179+
string: LATEST-GPU-MODEL
180+
uuid:
181+
string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
182+
capacity:
183+
memory:
184+
value: 80Gi
189185
name: gpu-3
190-
- basic:
191-
attributes:
192-
driverVersion:
193-
version: 1.0.0
194-
index:
195-
int: 4
196-
model:
197-
string: LATEST-GPU-MODEL
198-
uuid:
199-
string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
200-
capacity:
201-
memory:
202-
value: 80Gi
186+
- attributes:
187+
driverVersion:
188+
version: 1.0.0
189+
index:
190+
int: 4
191+
model:
192+
string: LATEST-GPU-MODEL
193+
uuid:
194+
string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
195+
capacity:
196+
memory:
197+
value: 80Gi
203198
name: gpu-4
204-
- basic:
205-
attributes:
206-
driverVersion:
207-
version: 1.0.0
208-
index:
209-
int: 5
210-
model:
211-
string: LATEST-GPU-MODEL
212-
uuid:
213-
string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
214-
capacity:
215-
memory:
216-
value: 80Gi
199+
- attributes:
200+
driverVersion:
201+
version: 1.0.0
202+
index:
203+
int: 5
204+
model:
205+
string: LATEST-GPU-MODEL
206+
uuid:
207+
string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
208+
capacity:
209+
memory:
210+
value: 80Gi
217211
name: gpu-5
218-
- basic:
219-
attributes:
220-
driverVersion:
221-
version: 1.0.0
222-
index:
223-
int: 6
224-
model:
225-
string: LATEST-GPU-MODEL
226-
uuid:
227-
string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
228-
capacity:
229-
memory:
230-
value: 80Gi
212+
- attributes:
213+
driverVersion:
214+
version: 1.0.0
215+
index:
216+
int: 6
217+
model:
218+
string: LATEST-GPU-MODEL
219+
uuid:
220+
string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
221+
capacity:
222+
memory:
223+
value: 80Gi
231224
name: gpu-6
232-
- basic:
233-
attributes:
234-
driverVersion:
235-
version: 1.0.0
236-
index:
237-
int: 7
238-
model:
239-
string: LATEST-GPU-MODEL
240-
uuid:
241-
string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
242-
capacity:
243-
memory:
244-
value: 80Gi
225+
- attributes:
226+
driverVersion:
227+
version: 1.0.0
228+
index:
229+
int: 7
230+
model:
231+
string: LATEST-GPU-MODEL
232+
uuid:
233+
string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
234+
capacity:
235+
memory:
236+
value: 80Gi
245237
name: gpu-7
246238
kind: List
247239
metadata:

cmd/dra-example-kubeletplugin/discovery.go

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
"math/rand"
2222
"os"
2323

24-
resourceapi "k8s.io/api/resource/v1beta1"
24+
resourceapi "k8s.io/api/resource/v1"
2525
"k8s.io/apimachinery/pkg/api/resource"
2626
"k8s.io/utils/ptr"
2727

@@ -36,25 +36,23 @@ func enumerateAllPossibleDevices(numGPUs int) (AllocatableDevices, error) {
3636
for i, uuid := range uuids {
3737
device := resourceapi.Device{
3838
Name: fmt.Sprintf("gpu-%d", i),
39-
Basic: &resourceapi.BasicDevice{
40-
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
41-
"index": {
42-
IntValue: ptr.To(int64(i)),
43-
},
44-
"uuid": {
45-
StringValue: ptr.To(uuid),
46-
},
47-
"model": {
48-
StringValue: ptr.To("LATEST-GPU-MODEL"),
49-
},
50-
"driverVersion": {
51-
VersionValue: ptr.To("1.0.0"),
52-
},
39+
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
40+
"index": {
41+
IntValue: ptr.To(int64(i)),
5342
},
54-
Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
55-
"memory": {
56-
Value: resource.MustParse("80Gi"),
57-
},
43+
"uuid": {
44+
StringValue: ptr.To(uuid),
45+
},
46+
"model": {
47+
StringValue: ptr.To("LATEST-GPU-MODEL"),
48+
},
49+
"driverVersion": {
50+
VersionValue: ptr.To("1.0.0"),
51+
},
52+
},
53+
Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
54+
"memory": {
55+
Value: resource.MustParse("80Gi"),
5856
},
5957
},
6058
}

cmd/dra-example-kubeletplugin/driver.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ package main
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"maps"
2324

24-
resourceapi "k8s.io/api/resource/v1beta1"
25+
resourceapi "k8s.io/api/resource/v1"
2526
"k8s.io/apimachinery/pkg/types"
27+
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2628
coreclientset "k8s.io/client-go/kubernetes"
2729
"k8s.io/dynamic-resource-allocation/kubeletplugin"
2830
"k8s.io/dynamic-resource-allocation/resourceslice"
@@ -36,11 +38,13 @@ type driver struct {
3638
helper *kubeletplugin.Helper
3739
state *DeviceState
3840
healthcheck *healthcheck
41+
cancelCtx func(error)
3942
}
4043

4144
func NewDriver(ctx context.Context, config *Config) (*driver, error) {
4245
driver := &driver{
43-
client: config.coreclient,
46+
client: config.coreclient,
47+
cancelCtx: config.cancelMainCtx,
4448
}
4549

4650
state, err := NewDeviceState(config)
@@ -149,3 +153,10 @@ func (d *driver) unprepareResourceClaim(_ context.Context, claim kubeletplugin.N
149153

150154
return nil
151155
}
156+
157+
func (d *driver) HandleError(ctx context.Context, err error, msg string) {
158+
utilruntime.HandleErrorWithContext(ctx, err, msg)
159+
if !errors.Is(err, kubeletplugin.ErrRecoverable) && d.cancelCtx != nil {
160+
d.cancelCtx(fmt.Errorf("fatal background error: %w", err))
161+
}
162+
}

cmd/dra-example-kubeletplugin/main.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,9 @@ type Flags struct {
5252
}
5353

5454
type Config struct {
55-
flags *Flags
56-
coreclient coreclientset.Interface
55+
flags *Flags
56+
coreclient coreclientset.Interface
57+
cancelMainCtx func(error)
5758
}
5859

5960
func (c Config) DriverPluginPath() string {
@@ -172,6 +173,8 @@ func RunPlugin(ctx context.Context, config *Config) error {
172173

173174
ctx, stop := signal.NotifyContext(ctx, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
174175
defer stop()
176+
ctx, cancel := context.WithCancelCause(ctx)
177+
config.cancelMainCtx = cancel
175178

176179
driver, err := NewDriver(ctx, config)
177180
if err != nil {
@@ -182,7 +185,7 @@ func RunPlugin(ctx context.Context, config *Config) error {
182185
// restore default signal behavior as soon as possible in case graceful
183186
// shutdown gets stuck.
184187
stop()
185-
if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
188+
if err := context.Cause(ctx); err != nil && !errors.Is(err, context.Canceled) {
186189
// A canceled context is the normal case here when the process receives
187190
// a signal. Only log the error for more interesting cases.
188191
logger.Error(err, "error from context")

cmd/dra-example-kubeletplugin/state.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
"slices"
2222
"sync"
2323

24-
resourceapi "k8s.io/api/resource/v1beta1"
24+
resourceapi "k8s.io/api/resource/v1"
2525
"k8s.io/apimachinery/pkg/runtime"
2626
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2727
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"

demo/gpu-test1.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: gpu-test1
99

1010
---
11-
apiVersion: resource.k8s.io/v1beta1
11+
apiVersion: resource.k8s.io/v1
1212
kind: ResourceClaimTemplate
1313
metadata:
1414
namespace: gpu-test1
@@ -18,7 +18,8 @@ spec:
1818
devices:
1919
requests:
2020
- name: gpu
21-
deviceClassName: gpu.example.com
21+
exactly:
22+
deviceClassName: gpu.example.com
2223

2324
---
2425
apiVersion: v1

demo/gpu-test2.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: gpu-test2
99

1010
---
11-
apiVersion: resource.k8s.io/v1beta1
11+
apiVersion: resource.k8s.io/v1
1212
kind: ResourceClaimTemplate
1313
metadata:
1414
namespace: gpu-test2
@@ -17,10 +17,12 @@ spec:
1717
spec:
1818
devices:
1919
requests:
20-
- name: gpus
21-
deviceClassName: gpu.example.com
22-
allocationMode: ExactCount
23-
count: 2
20+
- name: gpu-1
21+
exactly:
22+
deviceClassName: gpu.example.com
23+
- name: gpu-2
24+
exactly:
25+
deviceClassName: gpu.example.com
2426

2527
---
2628
apiVersion: v1

0 commit comments

Comments
 (0)