
Commit c4469a5

Don't add finalizer if Workload doesn't have TopologyAssignment.
1 parent: 04c024c

6 files changed: +274, -97 lines
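
Per the commit title, the cleanup finalizer is now only added for Workloads that were actually admitted with a topology assignment. The guard itself lives in one of the changed files not shown in this excerpt; below is a minimal sketch of what such a check looks like, assuming the Kueue v1beta1 field layout:

package main

import kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"

// hasTopologyAssignment is a hypothetical helper, not the commit's actual
// code: it reports whether any admitted PodSet of the Workload carries a
// TopologyAssignment, the condition the commit title keys the finalizer on.
func hasTopologyAssignment(wl *kueue.Workload) bool {
	if wl.Status.Admission == nil {
		return false
	}
	for _, psa := range wl.Status.Admission.PodSetAssignments {
		if psa.TopologyAssignment != nil {
			return true
		}
	}
	return false
}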

slice/config/rbac/role.yaml

Lines changed: 0 additions & 8 deletions
@@ -30,14 +30,6 @@ rules:
   - list
   - update
   - watch
-- apiGroups:
-  - jobset.x-k8s.io
-  resources:
-  - jobsets
-  verbs:
-  - get
-  - list
-  - watch
 - apiGroups:
   - kueue.x-k8s.io
   resources:
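
The removed rule pairs with the controller change below: the reconciler no longer reads JobSets, so the generated role drops the jobset.x-k8s.io permissions. In a kubebuilder layout this file is produced by controller-gen from RBAC markers, so the edit presumably amounts to deleting the marker shown removed in workload_controller.go:

// Deleted marker; regenerating manifests (typically `make manifests`)
// then drops the rule from config/rbac/role.yaml:
// +kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets,verbs=get;list;watch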

slice/internal/controller/indexer.go

Lines changed: 5 additions & 5 deletions
@@ -28,17 +28,17 @@ import (
 )
 
 const (
-	OwnerReferenceUID = "metadata.ownerReferences.uid"
+	OwnerReferenceName = "metadata.ownerReferences.name"
 )
 
-func IndexOwnerReferenceUID(obj client.Object) []string {
-	return slices.Map(obj.GetOwnerReferences(), func(o *metav1.OwnerReference) string { return string(o.UID) })
+func IndexOwnerReferenceName(obj client.Object) []string {
+	return slices.Map(obj.GetOwnerReferences(), func(o *metav1.OwnerReference) string { return o.Name })
 }
 
 // Setup sets the index with the given fields for core apis.
 func Setup(ctx context.Context, indexer client.FieldIndexer) error {
-	if err := indexer.IndexField(ctx, &kueue.Workload{}, OwnerReferenceUID, IndexOwnerReferenceUID); err != nil {
-		return fmt.Errorf("setting index on ownerReferences.uid for Workload: %w", err)
+	if err := indexer.IndexField(ctx, &kueue.Workload{}, OwnerReferenceName, IndexOwnerReferenceName); err != nil {
+		return fmt.Errorf("setting index on ownerReferences.name for Workload: %w", err)
 	}
 	return nil
 }
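
The index key changes from owner UID to owner name because the new event source below is Pods: a Pod carries its JobSet's name in a label, but never the JobSet's UID, so Workload lookups must be keyed by name. A rough usage sketch (import path and wiring assumed, not shown in this commit):

package main

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"

	// Assumed import path, derived from the module's other imports.
	controller "tpu-slice-controller/internal/controller"
)

// workloadsOwnedByJobSet lists Workloads whose ownerReferences include the
// given name, served from the cache via the index registered in Setup.
// Unlike the old UID index, a name match says nothing about the owner's
// kind or generation; that looseness is the price of Pod-label lookups.
func workloadsOwnedByJobSet(ctx context.Context, cl client.Client, ns, name string) (*kueue.WorkloadList, error) {
	wls := &kueue.WorkloadList{}
	err := cl.List(ctx, wls,
		client.InNamespace(ns),
		client.MatchingFields{controller.OwnerReferenceName: name},
	)
	return wls, err
}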

slice/internal/controller/workload_controller.go

Lines changed: 52 additions & 40 deletions
@@ -35,8 +35,8 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/predicate"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
+	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
-	"sigs.k8s.io/kueue/pkg/controller/core/indexer"
 	"sigs.k8s.io/kueue/pkg/workload"
 
 	"tpu-slice-controller/api/v1alpha1"
@@ -45,6 +45,8 @@ import (
 const (
 	CleanupSliceFinalizerName   = "accelerator.gke.io/slice"
 	TPUReservationSubblockLabel = "cloud.google.com/gke-tpu-reservation-subblock"
+
+	tasLabelValue = "true"
 )
 
 var (
@@ -69,7 +71,6 @@ func NewWorkloadReconciler(cl client.Client) *WorkloadReconciler {
 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch
 // +kubebuilder:rbac:groups=slice.accelerator.gke.io,resources=slices,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=slice.accelerator.gke.io,resources=slices/finalizers,verbs=update
-// +kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets,verbs=get;list;watch
 // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch
 
 func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
@@ -137,6 +138,7 @@ func (r *WorkloadReconciler) finalize(ctx context.Context, wl *kueue.Workload) e
 
 	controllerutil.RemoveFinalizer(wl, CleanupSliceFinalizerName)
 	if err := r.client.Update(ctx, wl); err != nil {
+		log.Error(err, "Removing finalizer")
 		return err
 	}
 
@@ -151,25 +153,18 @@ func (r *WorkloadReconciler) podsGracefullyTerminated(ctx context.Context, wl *k
 	}
 
 	owner := metav1.GetControllerOf(wl)
-
-	jobSet := &jobset.JobSet{}
-	err := r.client.Get(ctx, types.NamespacedName{Name: owner.Name, Namespace: wl.Namespace}, jobSet)
-	if client.IgnoreNotFound(err) != nil {
-		return false, err
-	}
-	// That means the JobSet has already been deleted, along with all associated Jobs and Pods
-	// we should delete Slice and finalize Workload.
-	if err != nil {
-		//nolint:nilerr // Return nil to delete Slice and finalize Workload.
+	if owner == nil {
+		// That means the JobSet has already been deleted, along with all associated Jobs and Pods
+		// we should delete Slice and finalize Workload.
 		return true, nil
 	}
 
 	pods := &corev1.PodList{}
 	opts := []client.ListOption{
-		client.InNamespace(jobSet.Namespace),
-		client.MatchingLabels{jobset.JobSetNameKey: jobSet.Name},
+		client.InNamespace(wl.Namespace),
+		client.MatchingLabels{jobset.JobSetNameKey: owner.Name},
 	}
-	err = r.client.List(ctx, pods, opts...)
+	err := r.client.List(ctx, pods, opts...)
 	if err != nil {
 		return false, err
 	}
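
Two things change in podsGracefullyTerminated: the JobSet GET disappears (a missing controller reference on the Workload is now itself the signal that the JobSet and its Pods are gone), and the Pod list is scoped by the Workload's namespace plus the owner name from that reference. This also closes a latent nil dereference: the old code read owner.Name straight after metav1.GetControllerOf without checking for nil.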
@@ -241,83 +236,100 @@ func (r *WorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		For(&kueue.Workload{}).
 		Named("workload").
 		WithEventFilter(r).
-		Watches(&jobset.JobSet{}, &jobSetHandler{r: r}).
+		Watches(&corev1.Pod{}, &podHandler{r: r}).
 		Complete(r)
 }
 
 var _ predicate.Predicate = (*WorkloadReconciler)(nil)
 
 func (r *WorkloadReconciler) Create(e event.CreateEvent) bool {
-	return r.handle(e.Object)
+	return r.handleEvent(e.Object)
 }
 
-func (r *WorkloadReconciler) Delete(event.DeleteEvent) bool {
-	// Nothing handle for Delete event.
-	return false
+func (r *WorkloadReconciler) Delete(e event.DeleteEvent) bool {
+	return r.handleEvent(e.Object)
 }
 
 func (r *WorkloadReconciler) Update(e event.UpdateEvent) bool {
-	return r.handle(e.ObjectNew)
+	return r.handleEvent(e.ObjectNew)
 }
 
 func (r *WorkloadReconciler) Generic(event.GenericEvent) bool {
 	// Nothing handle for Generic event.
 	return false
 }
 
-func (r *WorkloadReconciler) handle(obj client.Object) bool {
+func (r *WorkloadReconciler) handleEvent(obj client.Object) bool {
 	wl, isWorkload := obj.(*kueue.Workload)
 	if !isWorkload {
 		return true
 	}
-	return r.handleWorkload(wl)
+	return r.shouldHandleWorkload(wl)
 }
 
-func (r *WorkloadReconciler) handleWorkload(wl *kueue.Workload) bool {
+func (r *WorkloadReconciler) shouldHandleWorkload(wl *kueue.Workload) bool {
 	// We should handle all Workloads that have the cleanup slice finalizer.
 	return controllerutil.ContainsFinalizer(wl, CleanupSliceFinalizerName) || !r.shouldFinalize(wl)
 }
 
-var _ handler.EventHandler = (*jobSetHandler)(nil)
+var _ handler.EventHandler = (*podHandler)(nil)
 
-type jobSetHandler struct {
+type podHandler struct {
 	r *WorkloadReconciler
 }
 
-func (h *jobSetHandler) Generic(context.Context, event.GenericEvent, workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+func (h *podHandler) Generic(context.Context, event.GenericEvent, workqueue.TypedRateLimitingInterface[reconcile.Request]) {
 }
 
-func (h *jobSetHandler) Create(_ context.Context, e event.CreateEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
-	h.handle(e.Object, q)
+func (h *podHandler) Create(context.Context, event.CreateEvent, workqueue.TypedRateLimitingInterface[reconcile.Request]) {
 }
 
-func (h *jobSetHandler) Update(_ context.Context, e event.UpdateEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
-	h.handle(e.ObjectNew, q)
+func (h *podHandler) Delete(ctx context.Context, e event.DeleteEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+	h.handleEvent(ctx, e.Object, q)
 }
 
-func (h *jobSetHandler) Delete(context.Context, event.DeleteEvent, workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+func (h *podHandler) Update(context.Context, event.UpdateEvent, workqueue.TypedRateLimitingInterface[reconcile.Request]) {
 }
 
-func (h *jobSetHandler) handle(obj client.Object, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
-	jobSet, isJobSet := obj.(*jobset.JobSet)
-	if !isJobSet {
+func (h *podHandler) handleEvent(ctx context.Context, obj client.Object, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+	pod, isPod := obj.(*corev1.Pod)
+	// Only Pods with the TAS label should be handled.
+	if !isPod || pod.Labels[kueuealpha.TASLabel] != tasLabelValue {
 		return
 	}
 
-	ctx := context.Background()
+	jobSetName := pod.Labels[jobset.JobSetNameKey]
+	// Only pods owned by a JobSet should be handled.
+	if jobSetName == "" {
+		return
+	}
+
+	log := ctrl.LoggerFrom(ctx).WithValues(
+		"pod", klog.KObj(pod),
+		"jobSet", klog.ObjectRef{Name: jobSetName, Namespace: pod.Namespace},
+	)
+	ctrl.LoggerInto(ctx, log)
+
+	log.V(5).Info("Handle Pod event")
+
 	workloads := &kueue.WorkloadList{}
 	opts := []client.ListOption{
-		client.InNamespace(jobSet.Namespace),
-		client.MatchingFields{indexer.OwnerReferenceUID: string(jobSet.GetUID())},
+		client.InNamespace(pod.Namespace),
+		client.MatchingFields{OwnerReferenceName: jobSetName},
 	}
 	err := h.r.client.List(ctx, workloads, opts...)
 	if err != nil {
-		h.r.log.Error(err, "failed to list workloads", "jobset", klog.KObj(jobSet))
+		log.Error(err, "Failed to list workloads")
+		return
+	}
+
+	if len(workloads.Items) == 0 {
+		log.V(5).Info("No Workloads found for the JobSet – skipping handling")
 		return
 	}
 
 	for _, wl := range workloads.Items {
-		if h.r.handleWorkload(&wl) {
+		if h.r.shouldHandleWorkload(&wl) {
 			req := reconcile.Request{
 				NamespacedName: types.NamespacedName{
 					Name: wl.Name,
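Taken together, the controller now reacts to Pod deletions instead of JobSet events: when a TAS-labeled Pod belonging to a JobSet disappears, podHandler resolves the owning Workloads through the name index and enqueues those that still need handling, while Create and Update are deliberate no-ops since cleanup only progresses as Pods terminate. For orientation, a Pod that would pass the handler's filter looks roughly like this (names and label values illustrative; the label keys are the constants the diff imports):

package main

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
)

// examplePod builds a Pod that podHandler.Delete would act on: it carries
// Kueue's TAS label (matching tasLabelValue, "true") and a JobSet name
// label, so its deletion triggers a Workload lookup for "demo-jobset".
func examplePod() *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "demo-jobset-worker-0-0",
			Namespace: "default",
			Labels: map[string]string{
				kueuealpha.TASLabel:  "true",
				jobset.JobSetNameKey: "demo-jobset",
			},
		},
	}
}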