Skip to content

Commit 0cafddc

Browse files
authored
[slice] Parse cloud.google.com/gke-tpu-topology annotation for podSet size.
2 parents 690022a + 9316c60 commit 0cafddc

File tree

12 files changed

+554
-267
lines changed

12 files changed

+554
-267
lines changed

slice/go.mod

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,19 @@ module tpu-slice-controller
33
go 1.24.0
44

55
require (
6+
github.com/go-logr/logr v1.4.3
7+
github.com/google/go-cmp v0.7.0
68
github.com/onsi/ginkgo/v2 v2.23.4
79
github.com/onsi/gomega v1.37.0
810
github.com/open-policy-agent/cert-controller v0.13.0
9-
k8s.io/apimachinery v0.33.2
10-
k8s.io/client-go v0.33.2
11+
go.uber.org/zap v1.27.0
12+
k8s.io/api v0.33.3
13+
k8s.io/apimachinery v0.33.3
14+
k8s.io/client-go v0.33.3
15+
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397
1116
sigs.k8s.io/controller-runtime v0.21.0
12-
sigs.k8s.io/jobset v0.8.1
13-
)
14-
15-
require (
16-
github.com/kubeflow/mpi-operator v0.6.0 // indirect
17-
github.com/kubeflow/training-operator v1.9.2 // indirect
18-
github.com/project-codeflare/appwrapper v1.1.2 // indirect
19-
github.com/ray-project/kuberay/ray-operator v1.3.2 // indirect
20-
github.com/sirupsen/logrus v1.9.3 // indirect
21-
go.uber.org/atomic v1.11.0 // indirect
17+
sigs.k8s.io/jobset v0.8.2
18+
sigs.k8s.io/kueue v0.13.0-rc.0
2219
)
2320

2421
require (
@@ -34,7 +31,6 @@ require (
3431
github.com/felixge/httpsnoop v1.0.4 // indirect
3532
github.com/fsnotify/fsnotify v1.9.0 // indirect
3633
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
37-
github.com/go-logr/logr v1.4.3
3834
github.com/go-logr/stdr v1.2.2 // indirect
3935
github.com/go-logr/zapr v1.3.0 // indirect
4036
github.com/go-openapi/jsonpointer v0.21.1 // indirect
@@ -45,22 +41,26 @@ require (
4541
github.com/google/btree v1.1.3 // indirect
4642
github.com/google/cel-go v0.23.2 // indirect
4743
github.com/google/gnostic-models v0.6.9 // indirect
48-
github.com/google/go-cmp v0.7.0
4944
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
5045
github.com/google/uuid v1.6.0 // indirect
5146
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect
5247
github.com/inconshreveable/mousetrap v1.1.0 // indirect
5348
github.com/josharian/intern v1.0.0 // indirect
5449
github.com/json-iterator/go v1.1.12 // indirect
50+
github.com/kubeflow/mpi-operator v0.6.0 // indirect
51+
github.com/kubeflow/training-operator v1.9.2 // indirect
5552
github.com/mailru/easyjson v0.9.0 // indirect
5653
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
5754
github.com/modern-go/reflect2 v1.0.2 // indirect
5855
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
5956
github.com/pkg/errors v0.9.1 // indirect
57+
github.com/project-codeflare/appwrapper v1.1.2 // indirect
6058
github.com/prometheus/client_golang v1.22.0 // indirect
6159
github.com/prometheus/client_model v0.6.2 // indirect
6260
github.com/prometheus/common v0.63.0 // indirect
6361
github.com/prometheus/procfs v0.16.0 // indirect
62+
github.com/ray-project/kuberay/ray-operator v1.3.2 // indirect
63+
github.com/sirupsen/logrus v1.9.3 // indirect
6464
github.com/spf13/cobra v1.9.1 // indirect
6565
github.com/spf13/pflag v1.0.6 // indirect
6666
github.com/stoewer/go-strcase v1.3.0 // indirect
@@ -74,9 +74,10 @@ require (
7474
go.opentelemetry.io/otel/sdk v1.33.0 // indirect
7575
go.opentelemetry.io/otel/trace v1.35.0 // indirect
7676
go.opentelemetry.io/proto/otlp v1.4.0 // indirect
77+
go.uber.org/atomic v1.11.0 // indirect
7778
go.uber.org/automaxprocs v1.6.0 // indirect
7879
go.uber.org/multierr v1.11.0 // indirect
79-
go.uber.org/zap v1.27.0
80+
go.yaml.in/yaml/v2 v2.4.2 // indirect
8081
golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 // indirect
8182
golang.org/x/net v0.38.0 // indirect
8283
golang.org/x/oauth2 v0.28.0 // indirect
@@ -94,18 +95,15 @@ require (
9495
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
9596
gopkg.in/inf.v0 v0.9.1 // indirect
9697
gopkg.in/yaml.v3 v3.0.1 // indirect
97-
k8s.io/api v0.33.2
9898
k8s.io/apiextensions-apiserver v0.33.1 // indirect
9999
k8s.io/apiserver v0.33.2 // indirect
100100
k8s.io/component-base v0.33.2 // indirect
101101
k8s.io/component-helpers v0.33.2 // indirect
102102
k8s.io/klog/v2 v2.130.1 // indirect
103103
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
104-
k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e
105104
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect
106105
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
107-
sigs.k8s.io/kueue v0.13.0-devel.0.20250623150000-7149234c4989
108106
sigs.k8s.io/randfill v1.0.0 // indirect
109107
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
110-
sigs.k8s.io/yaml v1.4.0 // indirect
108+
sigs.k8s.io/yaml v1.5.0 // indirect
111109
)

slice/go.sum

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
181181
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
182182
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
183183
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
184+
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
185+
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
186+
go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE=
187+
go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI=
184188
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
185189
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
186190
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
@@ -245,16 +249,16 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
245249
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
246250
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
247251
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
248-
k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY=
249-
k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs=
252+
k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8=
253+
k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE=
250254
k8s.io/apiextensions-apiserver v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI=
251255
k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA=
252-
k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY=
253-
k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM=
256+
k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA=
257+
k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM=
254258
k8s.io/apiserver v0.33.2 h1:KGTRbxn2wJagJowo29kKBp4TchpO1DRO3g+dB/KOJN4=
255259
k8s.io/apiserver v0.33.2/go.mod h1:9qday04wEAMLPWWo9AwqCZSiIn3OYSZacDyu/AcoM/M=
256-
k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E=
257-
k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo=
260+
k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA=
261+
k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg=
258262
k8s.io/component-base v0.33.2 h1:sCCsn9s/dG3ZrQTX/Us0/Sx2R0G5kwa0wbZFYoVp/+0=
259263
k8s.io/component-base v0.33.2/go.mod h1:/41uw9wKzuelhN+u+/C59ixxf4tYQKW7p32ddkYNe2k=
260264
k8s.io/component-helpers v0.33.2 h1:AjCtYzst11NV8ensxV/2LEEXRwctqS7Bs44bje9Qcnw=
@@ -265,24 +269,25 @@ k8s.io/kube-aggregator v0.33.1 h1:PigQUqAvd6Y4hBjQAqhKz3lEJC2VHLL4bSOEuS06a40=
265269
k8s.io/kube-aggregator v0.33.1/go.mod h1:16/wlU5Lj7hNJSv7JSu5FLvxyrgiJVLCHzfVoECAsuI=
266270
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4=
267271
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8=
268-
k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e h1:KqK5c/ghOm8xkHYhlodbp6i6+r+ChV2vuAuVRdFbLro=
269-
k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
272+
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y=
273+
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
270274
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
271275
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
272276
sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8=
273277
sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
274-
sigs.k8s.io/jobset v0.8.1 h1:FwosQQyjmYgOKy8sD3ERpfil1BDLNpYUOJdIqmZSz3s=
275-
sigs.k8s.io/jobset v0.8.1/go.mod h1:yitjuGOExl2p964nhyevQGIkfiPSRHcdC3zNBneKCT8=
278+
sigs.k8s.io/jobset v0.8.2 h1:WC5a5G7MqfJJy4p+6OxGMpfbB90KoDSay96Mc4yMMZM=
279+
sigs.k8s.io/jobset v0.8.2/go.mod h1:yitjuGOExl2p964nhyevQGIkfiPSRHcdC3zNBneKCT8=
276280
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE=
277281
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
278-
sigs.k8s.io/kueue v0.13.0-devel.0.20250623150000-7149234c4989 h1:JzJvHdmpsS7lSdtmfPfbsmQ+ai8ggFCOFI4P5yiLW8A=
279-
sigs.k8s.io/kueue v0.13.0-devel.0.20250623150000-7149234c4989/go.mod h1:MjpW259zOAhEM1Iv87tHge7zPUsypLEZ7sdxa+7HWrc=
282+
sigs.k8s.io/kueue v0.13.0-rc.0 h1:5n/JzaisYW3JvhGr1+2DbU00GnYG0cKrldZOqf2KOK4=
283+
sigs.k8s.io/kueue v0.13.0-rc.0/go.mod h1:anunCVwyBEVpmuZhNUK3XusE2vS1I0cTyKZzT7nr0qo=
280284
sigs.k8s.io/lws v0.6.2 h1:5ulPJDaLBI9zk6ayGO2Lfg9P/FBL3C1LsmHmJVqvHvo=
281285
sigs.k8s.io/lws v0.6.2/go.mod h1:7nbwcpHwdDticuWPTDe6Va5OpjasS0MoVeVD61N5Y0c=
282286
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
283287
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
284288
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
285289
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI=
286290
sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
287-
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
288291
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
292+
sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ=
293+
sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4=

slice/hack/kind-cluster.yaml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,22 @@ nodes:
2323
v: "3"
2424
- role: worker
2525
labels:
26-
instance-type: on-demand
26+
cloud.google.com/gke-node-group: tas-group
27+
cloud.google.com/gke-tpu-accelerator: tpu-v7x
28+
cloud.google.com/gke-tpu-block: b1
29+
cloud.google.com/gke-tpu-subblock: sb1
2730
- role: worker
2831
labels:
29-
instance-type: spot
30-
32+
cloud.google.com/gke-node-group: tas-group
33+
cloud.google.com/gke-tpu-accelerator: tpu-v7x
34+
cloud.google.com/gke-tpu-block: b2
35+
cloud.google.com/gke-tpu-subblock: sb2
36+
- role: worker
37+
labels:
38+
cloud.google.com/gke-node-group: tas-group
39+
cloud.google.com/gke-tpu-accelerator: tpu-v7x
40+
cloud.google.com/gke-tpu-block: b2
41+
cloud.google.com/gke-tpu-subblock: sb3
3142
kubeadmConfigPatches:
3243
- |
3344
kind: JoinConfiguration

slice/internal/controller/workload_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ func (r *WorkloadReconciler) newSlice(wl *kueue.Workload) (*v1alpha1.Slice, erro
118118
}
119119
}
120120
slice.Spec.NodeSelector = map[string][]string{
121-
TPUReservationSubblockLabel: nodeSelectors.UnsortedList(),
121+
TPUReservationSubblockLabel: sets.List(nodeSelectors),
122122
}
123123
return slice, nil
124124
}

slice/internal/util/client/client.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
Copyright The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package client
18+
19+
import (
20+
"context"
21+
22+
"sigs.k8s.io/controller-runtime/pkg/client"
23+
)
24+
25+
func CreatePatch(before, after client.Object) (client.Patch, error) {
26+
patchBase := client.MergeFrom(before)
27+
patchBytes, err := patchBase.Data(after)
28+
if err != nil {
29+
return nil, err
30+
}
31+
return client.RawPatch(patchBase.Type(), patchBytes), nil
32+
}
33+
34+
// PatchStatus applies the merge patch of client.Object status.
35+
// The resourceVersion will be part of the patch, make this call fail if
36+
// client.Object was changed.
37+
func PatchStatus(ctx context.Context, c client.Client, obj client.Object, update func() (bool, error)) error {
38+
objOriginal := obj.DeepCopyObject().(client.Object)
39+
// Clearing ResourceVersion from the original object to make sure it is included in the generated patch.
40+
objOriginal.SetResourceVersion("")
41+
updated, err := update()
42+
if err != nil || !updated {
43+
return err
44+
}
45+
patch, err := CreatePatch(objOriginal, obj)
46+
if err != nil {
47+
return err
48+
}
49+
if err = c.Status().Patch(ctx, obj, patch); err != nil {
50+
return err
51+
}
52+
return nil
53+
}

slice/internal/util/testing/defaults.go

Lines changed: 0 additions & 29 deletions
This file was deleted.

slice/internal/util/testingjobs/jobset/wrappers.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ type ReplicatedJobRequirements struct {
4747
Labels map[string]string
4848
Annotations map[string]string
4949
PodAnnotations map[string]string
50+
NodeSelector map[string]string
5051
Image string
5152
Args []string
5253
}
@@ -72,6 +73,7 @@ func (j *JobSetWrapper) ReplicatedJobs(replicatedJobs ...ReplicatedJobRequiremen
7273
jt.Spec.Parallelism = ptr.To(req.Parallelism)
7374
jt.Spec.Completions = ptr.To(req.Completions)
7475
jt.Spec.Template.Annotations = req.PodAnnotations
76+
jt.Spec.Template.Spec.NodeSelector = req.NodeSelector
7577
if len(req.Image) > 0 {
7678
jt.Spec.BackoffLimit = ptr.To[int32](0)
7779
spec := &jt.Spec.Template.Spec

0 commit comments

Comments
 (0)