Commit d5a2b2e

Add support for shared TPU reservations (#823)

1 parent 9246bb3 commit d5a2b2e

4 files changed, +285 -9 lines changed

goldens.yaml

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,8 @@ goldens:
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=1 --memory-limit=1Mi --dry-run
   "Cluster create with CPU and memory limits above capacity":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --spot --cpu-limit=20 --memory-limit=1Gi --dry-run
+  "Cluster create with shared reservation":
+    command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --reservation=projects/reservation-project/reservations/golden-reservation --dry-run
   "Cluster create with gb200-4":
     command: xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --device-type=gb200-4 --reservation=golden-reservation --dry-run
   "Cluster create private":
Lines changed: 205 additions & 0 deletions

@@ -0,0 +1,205 @@
$ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-8 --reservation=projects/reservation-project/reservations/golden-reservation --dry-run
[XPK] Starting xpk v0.14.3
[XPK] Starting cluster create for cluster golden-cluster:
[XPK] Working on golden-project and us-central1-a
[XPK] Task: `Determine server supported GKE versions for default rapid gke version` is implemented by the following command not running since it is a dry run.
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.validVersions)"
[XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
[XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --location-policy=BALANCED --scopes=storage-full,gke-default
[XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
[XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
gcloud container clusters describe golden-cluster --project=golden-project --location=us-central1 --format="value(privateClusterConfig.enablePrivateNodes)"
[XPK] Private Nodes is not enabled on the cluster.
[XPK] Cluster is public and no need to authorize networks.
[XPK] Try 1: get-credentials-dns-endpoint to cluster golden-cluster
[XPK] Task: `get-credentials-dns-endpoint to cluster golden-cluster` is implemented by the following command not running since it is a dry run.
gcloud container clusters get-credentials golden-cluster --location=us-central1 --dns-endpoint --project=golden-project && kubectl config view && kubectl config set-context --current --namespace=default
[XPK] Testing credentials with kubectl...
[XPK] Task: `kubectl get pods` is implemented by the following command not running since it is a dry run.
kubectl get pods
[XPK] Credentials test succeeded.
[XPK] Finished get-credentials and kubectl setup.
[XPK] Task: 'Checking CoreDNS deployment existence' in progress for namespace: kube-system
[XPK] Task: `Check CoreDNS deployment in kube-system` is implemented by the following command not running since it is a dry run.
kubectl get deployment coredns -n kube-system
[XPK] Now verifying CoreDNS readiness...
[XPK] Task: `Waiting for kubeDNS to be checked.` is implemented by the following command not running since it is a dry run.
kubectl get deployment kube-dns -n kube-system --ignore-not-found
[XPK] kube-dns deployment not found.
[XPK] Verifying if CoreDNS is available...
[XPK] Task: `Wait for coredns available` is implemented by the following command not running since it is a dry run.
kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-system --timeout=240s
[XPK] CoreDNS has successfully started and passed verification.
[XPK] CoreDNS deployment 'coredns' found in namespace 'kube-system'.
[XPK] Skipping CoreDNS deployment since it already exists.
[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
[XPK] Creating 1 node pool or pools of tpu7x-8
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, requires_workload_policy=True)
[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a
[XPK] Creating 1 node pool or pools of tpu7x-8
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, requires_workload_policy=True)
[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
[XPK] Existing node pool names ['0']
[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
gcloud compute resource-policies describe tpu7x-8-2x2x1-placement-policy --project=golden-project --region=us-central1
[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --placement-policy=tpu7x-8-2x2x1-placement-policy --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
[XPK] Breaking up a total of 1 commands into 1 batches
[XPK] Pretending all the jobs succeeded
[XPK] Create or delete node pool request complete.
[XPK] Creating ConfigMap for cluster
[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a
[XPK] Breaking up a total of 2 commands into 1 batches
[XPK] Pretending all the jobs succeeded
[XPK] Enabling the jobset API on our cluster, to be deprecated when Jobset is globally available
[XPK] Try 1: Install Jobset on golden-cluster
[XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating jobset Controller Manager resources
[XPK] Task: `Updating jobset Controller Manager resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f 1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95
[XPK] Try 1: Install PathwaysJob on golden-cluster
[XPK] Task: `Install PathwaysJob on golden-cluster` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/v0.1.4/install.yaml
[XPK] Enabling Kueue on the cluster
[XPK] Task: `Get kueue version on server` is implemented by the following command not running since it is a dry run.
kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.spec.template.spec.containers[0].image}'
[XPK] Installing Kueue version v0.14.3...
[XPK] Try 1: Install Kueue
[XPK] Task: `Install Kueue` is implemented by the following command not running since it is a dry run.
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m
[XPK] Applying following Kueue resources:
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: "1xtpu7x-8"
spec:
  nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x1"}

---

apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: dws-prov
spec:
  controllerName: kueue.x-k8s.io/provisioning-request
  parameters:
    apiGroup: kueue.x-k8s.io
    kind: ProvisioningRequestConfig
    name: dws-config
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ProvisioningRequestConfig
metadata:
  name: dws-config
spec:
  provisioningClassName: queued-provisioning.gke.io
  podSetUpdates:
    nodeSelector:
    - key: autoscaling.gke.io/provisioning-request
      valueFromProvisioningClassDetail: ResizeRequestName
  managedResources:
  - google.com/tpu
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: "cluster-queue"
spec:
  preemption:
    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
    withinClusterQueue: LowerPriority
  namespaceSelector: {} # match all.
  resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xtpu7x-8', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 4}]}]}]

---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: multislice-queue
spec:
  clusterQueue: cluster-queue
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-high
value: 1000
globalDefault: false
description: "Very High"
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
kubectl apply -f ce52d2868b681f478f3f12e5696b1609e68b442a32f7f82603ba7064b825cf4f
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
kubectl get node --no-headers | wc -l
[XPK] Try 1: Updating Kueue Controller Manager resources
[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
[XPK] Verifying kjob installation
[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run.
kubectl-kjob help
[XPK] kjob found
[XPK] Applying kjob CDRs
[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run.
kubectl kjob printcrds | kubectl apply --server-side -f -
[XPK] Creating kjob CRDs succeeded
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run.
kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61
[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run.
kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8
[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run.
kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486
[XPK] GKE commands done! Resources are created.
[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
[XPK] Exiting XPK cleanly

src/xpk/core/capacity.py

Lines changed: 46 additions & 8 deletions
@@ -115,9 +115,12 @@ def get_reservation_maintenance_interval(
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation maintenance interval'
@@ -139,9 +142,12 @@ def get_reservation_placement_policy(
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation placement policy'
@@ -156,9 +162,12 @@ def get_reservation_deployment_type(
     reservation: str, zone: str, project: str
 ) -> str:
   """Get reservation deployment type."""
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      reservation, project
+  )
   command = (
-      f'gcloud beta compute reservations describe {reservation}'
-      f' --project={project} --zone={zone} --format="value(deploymentType)"'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation deployment type', dry_run_return_val='DENSE'
@@ -178,9 +187,12 @@ def verify_reservation_exists(args) -> int:
   Returns:
     0 if successful and 1 otherwise.
   """
+  reservation_project, reservation_name = get_reservation_project_and_name(
+      args.reservation, args.project
+  )
   command = (
-      f'gcloud beta compute reservations describe {args.reservation}'
-      f' --project={args.project} --zone={args.zone}'
+      f'gcloud beta compute reservations describe {reservation_name}'
+      f' --project={reservation_project} --zone={args.zone}'
   )
   return_code = run_command_with_updates(command, 'Describe reservation')
   if return_code != 0:
@@ -264,3 +276,29 @@ def get_capacity_node_selectors_from_capacity_type(
     )
     return_code = 1
   return node_selector, return_code
+
+
+def get_reservation_project_and_name(
+    reservation_name_or_path: str, cluster_project: str
+) -> tuple[str, str]:
+  """Get the reservation project and name.
+
+  Args:
+    reservation_name_or_path: either reservation name or reservation path in format
+      projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
+    cluster_project: the cluster project
+
+  Returns:
+    Tuple with reservation project and reservation name.
+  """
+  if '/' not in reservation_name_or_path:
+    return cluster_project, reservation_name_or_path
+  reservation_parts = reservation_name_or_path.split('/')
+  if (
+      len(reservation_parts) != 4
+      or reservation_parts[0] != 'projects'
+      or reservation_parts[2] != 'reservations'
+  ):
+    xpk_print('Unable to parse reservation: ', reservation_name_or_path)
+    xpk_exit(1)
+  return reservation_parts[1], reservation_parts[3]
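A short usage sketch of the new helper, assuming an installed xpk checkout so that xpk.core.capacity is importable (the module path matches the @patch targets in the tests below). It shows how the parsed pair feeds the describe command seen in the dry-run golden above:

from xpk.core.capacity import get_reservation_project_and_name

# Shared reservation: describe must target the owning project, not the
# cluster project.
project, name = get_reservation_project_and_name(
    'projects/reservation-project/reservations/golden-reservation',
    'golden-project',
)
command = (
    f'gcloud beta compute reservations describe {name}'
    f' --project={project} --zone=us-central1-a'
)
# command == 'gcloud beta compute reservations describe golden-reservation'
#            ' --project=reservation-project --zone=us-central1-a'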

src/xpk/core/capacity_test.py

Lines changed: 32 additions & 1 deletion
@@ -16,7 +16,7 @@

 import pytest
 from unittest.mock import MagicMock, patch
-from .capacity import get_reservation_deployment_type
+from .capacity import get_reservation_deployment_type, get_reservation_project_and_name


 @patch('xpk.core.capacity.xpk_print')
@@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
       reservation='reservation', zone='zone', project='project'
   )
   assert result == 'DENSE'
+
+
+def test_get_reservation_project_and_name_parses_local_reservation():
+  project, name = get_reservation_project_and_name(
+      'test-reservation', 'cluster-project'
+  )
+
+  assert project == 'cluster-project'
+  assert name == 'test-reservation'
+
+
+def test_get_reservation_project_and_name_parses_shared_reservation():
+  project, name = get_reservation_project_and_name(
+      'projects/reservation-project/reservations/test-reservation',
+      'cluster-project',
+  )
+
+  assert project == 'reservation-project'
+  assert name == 'test-reservation'
+
+
+@patch('xpk.core.capacity.xpk_print')
+def test_get_reservation_project_and_name_fails_for_invalid_reservation(
+    xpk_print: MagicMock, mocker
+):
+  with pytest.raises(SystemExit):
+    get_reservation_project_and_name(
+        'invalid/reservation',
+        'cluster-project',
+    )
+  assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]
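These tests should run with plain pytest from the repository root (for example, pytest src/xpk/core/capacity_test.py), assuming pytest and pytest-mock (for the mocker fixture) are installed.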
