
Commit ebae87d

Workload policy label (#817)
* feat: add workload policy label
* update goldens
1 parent 844261f commit ebae87d

File tree

7 files changed: +141 -18 lines changed

goldens/Workload_create.txt
goldens/Workload_create_pathways.txt
src/xpk/commands/workload.py
src/xpk/core/nodepool.py
src/xpk/core/nodepool_test.py
src/xpk/core/scheduling.py
src/xpk/core/scheduling_test.py

goldens/Workload_create.txt

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
 [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
 docker push gcr.io/golden-project/dry-run-runner:prefix-current
 [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
-kubectl apply -f abc33690f7a11b2ba50a8f949970fd3ba812f088367b7f64260729f01f41a231
+kubectl apply -f e21c8ebdc21d15a852187058c096898c486d3b1066e67dcfb67e5052a1d0a7fa
 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
 gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
 [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.

goldens/Workload_create_pathways.txt

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
 [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
 docker push gcr.io/golden-project/dry-run-runner:prefix-current
 [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
-kubectl apply -f 574963d6d441695d681ff94ad241e713559f64b4ce519f4f1e0708c659f1c25d
+kubectl apply -f 6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce
 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
 gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
 [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.

src/xpk/commands/workload.py

Lines changed: 28 additions & 0 deletions

@@ -63,6 +63,8 @@
     get_cpu_affinity,
     get_gpu_scheduler,
     create_sub_slicing_annotations,
+    create_placement_policy_label,
+    is_placement_policy_supported,
 )
 from ..core.storage import (
     GCE_PD_TYPE,
@@ -143,6 +145,7 @@
       nodeSelector:
         {accelerator_label}
         {machine_label}
+        {placement_policy_label}
         {autoprovisioning_args}
       priorityClassName: {args.priority}
       hostNetwork: true
@@ -192,6 +195,8 @@
       {gpu_scheduler}
       priorityClassName: {args.priority}
       restartPolicy: Never
+      nodeSelector:
+        {placement_policy_label}
       imagePullSecrets:
       - name: {args.docker_image_pull_secret}
       hostNetwork: true
@@ -237,6 +242,8 @@
     spec:
       priorityClassName: {args.priority}
      restartPolicy: Never
+      nodeSelector:
+        {placement_policy_label}
       imagePullSecrets:
       - name: {args.docker_image_pull_secret}
       dnsPolicy: ClusterFirstWithHostNet
@@ -272,6 +279,7 @@
       terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
       priorityClassName: {args.priority}
       nodeSelector:
+        {placement_policy_label}
         {autoprovisioning_args}
       pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
       controller:
@@ -517,6 +525,11 @@ def workload_create(args) -> None:
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
         annotations=annotations,
+        placement_policy_label=(
+            create_placement_policy_label(system)
+            if is_placement_policy_supported(system)
+            else ''
+        ),
     )

     sub_networks = get_cluster_subnetworks()
@@ -541,6 +554,11 @@ def workload_create(args) -> None:
         service_account=service_account,
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
+        placement_policy_label=(
+            create_placement_policy_label(system)
+            if is_placement_policy_supported(system)
+            else ''
+        ),
     )

   elif args.use_pathways and ensure_pathways_workload_prerequisites(
@@ -558,6 +576,11 @@ def workload_create(args) -> None:
         user_workload=get_user_workload_for_pathways(args, system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
+        placement_policy_label=(
+            create_placement_policy_label(system)
+            if is_placement_policy_supported(system)
+            else ''
+        ),
     )
   else:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -585,6 +608,11 @@ def workload_create(args) -> None:
                 create_sub_slicing_annotations(args.sub_slicing_topology)
             )
         ),
+        placement_policy_label=(
+            create_placement_policy_label(system)
+            if is_placement_policy_supported(system)
+            else ''
+        ),
         machine_label=create_machine_label(system.accelerator_type, system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
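
The new `placement_policy_label` keyword is substituted into the manifest templates as a plain `key: value` line under `nodeSelector`, and as an empty string when the system does not qualify, so unsupported systems get no extra selector. A minimal sketch of that substitution pattern follows; it uses a toy template with made-up values, not the actual XPK manifest.

# Toy illustration of how an optional label renders into a nodeSelector
# block via str.format(); the real templates in workload.py are far larger.
TOY_TEMPLATE = """nodeSelector:
  {accelerator_label}
  {placement_policy_label}
priorityClassName: {priority}
"""

def render(placement_policy_label: str) -> str:
  return TOY_TEMPLATE.format(
      accelerator_label='cloud.google.com/gke-tpu-accelerator: tpu7x',
      placement_policy_label=placement_policy_label,
      priority='medium',
  )

# Supported system: the label line appears in the selector.
print(render('cloud.google.com/placement-policy-name: tpu7x-1x1x1-placement-policy'))
# Unsupported system: the placeholder renders as an empty string.
print(render(''))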

src/xpk/core/nodepool.py

Lines changed: 3 additions & 5 deletions

@@ -16,7 +16,7 @@

 from typing import List
 from ..utils.console import ask_for_user_consent, xpk_print
-from ..utils.topology import is_topology_valid
+from .scheduling import get_placement_policy_name, is_placement_policy_supported
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
     H100_MEGA_DEVICE_TYPE,
@@ -258,10 +258,8 @@ def run_gke_node_pool_create_command(
     return 1

   placement_args = ''
-  if system.requires_workload_policy and is_topology_valid(system.topology):
-    placement_policy = (
-        f'{system.device_type}-{system.topology}-placement-policy'
-    )
+  if is_placement_policy_supported(system):
+    placement_policy = get_placement_policy_name(system)
     ensure_resource_policy_exists(placement_policy, args, system.topology)
     placement_args = f' --placement-policy={placement_policy}'
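
Because the node pool path now calls the same helpers, the GCE resource policy passed via `--placement-policy` and the `cloud.google.com/placement-policy-name` node selector emitted on the workload side share one name. A rough sketch of that agreement, with hypothetical attribute values and the `ensure_resource_policy_exists` call omitted:

# Sketch only: a stand-in object with the two attributes the naming uses.
class FakeSystem:
  device_type = 'tpu7x'
  topology = '1x1x1'

def placement_policy_name(system) -> str:
  # Mirrors get_placement_policy_name() from scheduling.py.
  return f'{system.device_type}-{system.topology}-placement-policy'

name = placement_policy_name(FakeSystem())
print(f' --placement-policy={name}')                      # node pool flag
print(f'cloud.google.com/placement-policy-name: {name}')  # workload selector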

src/xpk/core/nodepool_test.py

Lines changed: 12 additions & 10 deletions

@@ -146,21 +146,23 @@ def mock_nodepool_dependencies(mocker):
   )
   mocker.patch("xpk.core.nodepool.run_commands", return_value=0)
   mocker.patch("xpk.core.nodepool.ask_for_user_consent", return_value=True)
-  mock_is_topology_valid = mocker.patch("xpk.core.nodepool.is_topology_valid")
+  mock_is_placement_policy_supported = mocker.patch(
+      "xpk.core.nodepool.is_placement_policy_supported"
+  )
   mock_ensure_resource_policy = mocker.patch(
       "xpk.core.nodepool.ensure_resource_policy_exists"
   )
-  return mock_is_topology_valid, mock_ensure_resource_policy
+  return mock_is_placement_policy_supported, mock_ensure_resource_policy


 def test_placement_policy_created_for_gpu_with_valid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is created for GPUs with a valid topology."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = True
+  mock_is_placement_policy_supported.return_value = True
   args = mocker.Mock(
       tpu_type=None,
       device_type="h100-80gb-8",
@@ -188,10 +190,10 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is not created for GPUs with an invalid topology."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = False
+  mock_is_placement_policy_supported.return_value = False
   args = mocker.Mock(
       tpu_type=None,
       device_type="h100-80gb-8",
@@ -218,10 +220,10 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is created for tpu7x with a valid topology."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = True
+  mock_is_placement_policy_supported.return_value = True
   args = mocker.Mock(
       tpu_type="tpu7x-8",
       device_type=None,
@@ -251,10 +253,10 @@ def test_placement_policy_not_created_for_non7x_tpu(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is not created for non-tpu7x TPUs."""
-  mock_is_topology_valid, mock_ensure_resource_policy = (
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  mock_is_topology_valid.return_value = True
+  mock_is_placement_policy_supported.return_value = False
   args = mocker.Mock(
       tpu_type="v6e",
       device_type=None,

src/xpk/core/scheduling.py

Lines changed: 14 additions & 0 deletions

@@ -15,6 +15,7 @@
 """

 from ..utils.console import xpk_print
+from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
@@ -303,3 +304,16 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
       ),
       f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
   ]
+
+
+def create_placement_policy_label(system: SystemCharacteristics) -> str:
+  name = get_placement_policy_name(system)
+  return f'cloud.google.com/placement-policy-name: {name}'
+
+
+def get_placement_policy_name(system: SystemCharacteristics) -> str:
+  return f'{system.device_type}-{system.topology}-placement-policy'
+
+
+def is_placement_policy_supported(system: SystemCharacteristics) -> bool:
+  return system.requires_workload_policy and is_topology_valid(system.topology)
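
Together, the three helpers give a single source of truth for the policy name: `is_placement_policy_supported` gates the feature, `get_placement_policy_name` derives the name, and `create_placement_policy_label` wraps it in the node selector line. A minimal usage sketch, reusing the constructor field values from the tests below (assumes the `xpk` package is importable):

from xpk.core.scheduling import (
    create_placement_policy_label,
    get_placement_policy_name,
    is_placement_policy_supported,
)
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics

# Same fields the new tests use; '1x1x1' is treated as a valid topology there.
system = SystemCharacteristics(
    chips_per_vm=1,
    gce_machine_type='tpu7x-standard-1t',
    gke_accelerator='tpu7x',
    requires_workload_policy=True,
    topology='1x1x1',
    vms_per_slice=1,
    device_type='tpu7x',
    accelerator_type=AcceleratorType.TPU,
    supports_sub_slicing=False,
)

assert is_placement_policy_supported(system)
assert get_placement_policy_name(system) == 'tpu7x-1x1x1-placement-policy'
assert create_placement_policy_label(system) == (
    'cloud.google.com/placement-policy-name: tpu7x-1x1x1-placement-policy'
)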

src/xpk/core/scheduling_test.py

Lines changed: 82 additions & 1 deletion

@@ -14,7 +14,8 @@
 limitations under the License.
 """

-from .scheduling import create_sub_slicing_annotations
+from .scheduling import create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
+from .system_characteristics import SystemCharacteristics, AcceleratorType


 def test_create_sub_slicing_annotations_returns_valid_annotations():
@@ -29,3 +30,83 @@ def test_create_sub_slicing_annotations_returns_valid_annotations():
       ),
       'cloud.google.com/gke-tpu-slice-topology: 2x2',
   ]
+
+
+def test_create_placement_policy_label_returns_valid_label():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=False,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+  )
+  label = create_placement_policy_label(system_characteristics)
+  assert (
+      label
+      == 'cloud.google.com/placement-policy-name: tpu7x-1x1x1-placement-policy'
+  )
+
+
+def test_get_placement_policy_name_returns_valid_name():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=False,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+  )
+  name = get_placement_policy_name(system_characteristics)
+  assert name == 'tpu7x-1x1x1-placement-policy'
+
+
+def test_is_placement_policy_supported_returns_true_for_system_characteristics_supporting_workload_policy_and_having_valid_topology():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=True,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+  )
+  assert is_placement_policy_supported(system_characteristics) is True
+
+
+def test_is_placement_policy_supported_returns_false_for_system_characteristics_not_supporting_workload_policy_and_having_valid_topology():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=False,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+  )
+  assert is_placement_policy_supported(system_characteristics) is False
+
+
+def test_is_placement_policy_supported_returns_false_for_system_characteristics_supporting_workload_policy_and_having_invalid_topology():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=True,
+      topology='aaa',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+  )
+  assert is_placement_policy_supported(system_characteristics) is False
