Skip to content

Commit 25fe399

Browse files
sharabiani and gcie authored
"Select TPU by topology (#525)" + Fix errors (#563)
* fix yaml formatting for workloads with TPU and NAP
* refactor tpu system characteristics: internal representation of TPU machines has changed, so now grep that used the old format fails
* fix device_type issue
---------
Co-authored-by: gcie <gciesielski@google.com>
1 parent 14b33f2 commit 25fe399

File tree

5 files changed

+172
-1086
lines changed

5 files changed

+172
-1086
lines changed

.github/workflows/build_tests.yaml

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ jobs:
4040
group-name: ${{ steps.set-group-name.outputs.group-name }}
4141
zone: ${{ steps.set-zone.outputs.zone }}
4242
tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }}
43+
tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }}
4344
location: ${{steps.set-location.outputs.location}}
4445
run-id: ${{steps.set-run-id.outputs.run-id}}
4546
steps:
@@ -76,6 +77,10 @@ jobs:
7677
id: set-tpu-type
7778
run: |
7879
echo tpu-type=v4-8 >> $GITHUB_OUTPUT
80+
- name: set tpu-type-topology
81+
id: set-tpu-type-topology
82+
run: |
83+
echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT
7984
- name: set location
8085
id: set-location
8186
run: |
@@ -152,7 +157,7 @@ jobs:
152157
with:
153158
run-id: '${{needs.set-variables.outputs.run-id}}'
154159
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
155-
tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}'
160+
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
156161
zone: '${{needs.set-variables.outputs.zone}}'
157162
location: '${{needs.set-variables.outputs.location}}'
158163
secrets: inherit
@@ -165,7 +170,7 @@ jobs:
165170
with:
166171
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
167172
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
168-
tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}'
173+
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
169174
zone: '${{needs.set-variables.outputs.zone}}'
170175
location: '${{needs.set-variables.outputs.location}}'
171176
run-id: '${{needs.set-variables.outputs.run-id}}'
@@ -180,6 +185,7 @@ jobs:
180185
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
181186
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
182187
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
188+
tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}}
183189
zone: ${{needs.set-variables.outputs.zone}}
184190
run-id: '${{needs.set-variables.outputs.run-id}}'
185191
secrets: inherit

.github/workflows/reusable_workload_tests.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ on:
2424
tpu-type:
2525
required: true
2626
type: string
27+
tpu-type-topology:
28+
required: true
29+
type: string
2730
tpu-type-dws:
2831
required: false
2932
type: string
@@ -108,7 +111,7 @@ jobs:
108111
--docker-password='${{secrets.GCP_SA_KEY}}' \
109112
--docker-email='${{secrets.GCP_SA_EMAIL}}'
110113
- name: Run workload with private image
111-
run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key
114+
run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type-topology}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key
112115
- name: Wait for private image workload completion and confirm it succeeded
113116
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $PRIVATE_IMAGE_WORKLOAD_NAME --timeout 300
114117
- name: Delete kubectl secret

src/xpk/core/capacity.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type(
232232
case CapacityType.ON_DEMAND.name:
233233
node_selector = ''
234234
case CapacityType.FLEX_START.name:
235-
node_selector = 'cloud.google.com/gke-queued="true"'
235+
node_selector = 'cloud.google.com/gke-queued: "true"'
236236
case CapacityType.SPOT.name:
237-
node_selector = 'cloud.google.com/gke-spot="true"'
237+
node_selector = 'cloud.google.com/gke-spot: "true"'
238238
case CapacityType.RESERVATION.name:
239239
node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
240240
case _:

src/xpk/core/scheduling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
4949
missing_gke_accelerator_type = False
5050
if not cluster_config_map.get(system.gke_accelerator):
5151
xpk_print(
52-
f'Gke Accelerator Type Check: {args.workload} is requesting'
52+
f'GKE Accelerator Type Check: {args.workload} is requesting'
5353
f' {system.gke_accelerator} but cluster only contains'
5454
f' {cluster_config_map.keys()}. '
5555
)

0 commit comments

Comments
 (0)