Skip to content

Commit bcea7c8

Browse files
committed
workaround for v5e testing
1 parent 2b7c5f5 commit bcea7c8

File tree

1 file changed

+47
-50
lines changed

1 file changed

+47
-50
lines changed

src/xpk/core/scheduling.py

Lines changed: 47 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -25,86 +25,83 @@
2525

2626

2727
def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
28-
"""Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
28+
"""Check if workload can schedule based on the cluster resources.
29+
30+
This function validates that the resource requested by the user exists in the
31+
cluster's resource manifest (a ConfigMap) and that the requested quantity
32+
(e.g., number of VMs) does not exceed the available quantity.
2933
3034
Args:
31-
args: user provided arguments for running the command.
32-
system: system characteristics
35+
args: User-provided arguments for running the command.
36+
system: System characteristics derived from the user's request.
3337
3438
Returns:
35-
returns true if workload can schedule, otherwise returns false.
39+
True if the workload can be scheduled, otherwise False.
3640
"""
3741
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
3842
cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
3943

40-
# Prevents workload creation failure for existing clusters with no ConfigMap
44+
# If no ConfigMap exists, we cannot validate, so we optimistically proceed.
45+
# This maintains compatibility with older cluster setups.
4146
if cluster_config_map is None:
4247
xpk_print(
43-
'No ConfigMap exist for cluster with the name'
44-
f' {resources_configmap_name}.'
48+
f"Warning: Could not find resource ConfigMap '{resources_configmap_name}'. "
49+
"Proceeding without resource validation."
4550
)
4651
return True
4752

48-
# Check for gke accelerator type:
49-
missing_gke_accelerator_type = False
50-
if not cluster_config_map.get(system.gke_accelerator):
53+
# The user-facing device type (e.g., 'v5litepod-32') is the single source
54+
# of truth for identifying the resource in the cluster's manifest.
55+
user_facing_device_type = args.tpu_type if args.tpu_type else args.device_type
56+
57+
# --- Primary Validation ---
58+
# Check if the cluster's resource manifest contains an entry for the exact
59+
# device type the user requested. This is the only reliable existence check.
60+
if user_facing_device_type not in cluster_config_map:
5161
xpk_print(
52-
f'Gke Accelerator Type Check: {args.workload} is requesting'
53-
f' {system.gke_accelerator} but cluster only contains'
54-
f' {cluster_config_map.keys()}. '
62+
f"Device Type Check Failed: Workload '{args.workload}' is requesting "
63+
f"device type '{user_facing_device_type}', but the cluster's resource "
64+
f"manifest only contains entries for: {list(cluster_config_map.keys())}. "
65+
"The cluster may not be provisioned with this hardware type."
5566
)
56-
missing_gke_accelerator_type = True
57-
elif (
58-
cluster_config_map[system.gke_accelerator]
67+
return False
68+
69+
# --- Quantity Validation ---
70+
71+
# Handle autoprovisioning capacity checks.
72+
if (
73+
cluster_config_map[user_facing_device_type]
5974
== AUTOPROVISIONING_CONFIG_VALUE
6075
):
61-
# Run total chip check when in autoprovisioning mode.
6276
max_chips_in_cluster = int(
63-
cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
77+
cluster_config_map.get(AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, 0)
6478
)
6579
num_chips_in_workload = get_total_chips_requested_from_args(args, system)
6680

6781
if num_chips_in_workload > max_chips_in_cluster:
6882
xpk_print(
69-
f'{args.workload} is requesting {num_chips_in_workload} chips but'
70-
f' the cluster {args.cluster} supports up to {max_chips_in_cluster}.'
71-
' Resize the cluster to support more chips with'
72-
' `xpk cluster create --autoprovisioning-max-chips=X ...`'
83+
f"Chip Request Exceeds Limit: Workload '{args.workload}' requests "
84+
f"{num_chips_in_workload} chips, but the autoprovisioning cluster "
85+
f"'{args.cluster}' is configured for a maximum of {max_chips_in_cluster} chips."
7386
)
7487
return False
75-
return True
88+
return True # For autoprovisioning, chip count is sufficient.
7689

77-
# Check for device type
78-
missing_device_type = False
79-
device_type = system.device_type
80-
if device_type not in cluster_config_map:
81-
xpk_print(
82-
f'Device Type Check: {args.workload} is requesting {device_type} but '
83-
f'cluster only contains {cluster_config_map.keys()}. '
84-
)
85-
missing_device_type = True
90+
# For statically-sized clusters, check if the number of requested VMs fits.
91+
max_vm_in_cluster = int(cluster_config_map[user_facing_device_type])
92+
if system.accelerator_type == AcceleratorType['GPU']:
93+
vm_required_by_workload = args.num_nodes
94+
else:
95+
vm_required_by_workload = args.num_slices * system.vms_per_slice
8696

87-
if missing_device_type and missing_gke_accelerator_type:
97+
if vm_required_by_workload > max_vm_in_cluster:
8898
xpk_print(
89-
'Both Device Type and GKE Accelerator Type checks failed.'
90-
f' XPK will not create the workload {args.workload}.'
99+
f"VM Request Exceeds Capacity: Workload '{args.workload}' requests "
100+
f"{vm_required_by_workload} VMs for {args.num_slices} slice(s) of type "
101+
f"'{user_facing_device_type}', but the cluster only has "
102+
f"{max_vm_in_cluster} VMs of that type available."
91103
)
92104
return False
93-
else:
94-
# Check if the size of the workload will fit in the cluster.
95-
max_vm_in_cluster = int(cluster_config_map[device_type])
96-
if system.accelerator_type == AcceleratorType['GPU']:
97-
vm_required_by_workload = args.num_nodes
98-
else:
99-
vm_required_by_workload = args.num_slices * system.vms_per_slice
100-
if vm_required_by_workload > max_vm_in_cluster:
101-
xpk_print(
102-
f'{args.workload} is requesting {args.num_slices} slice/slices of'
103-
f' {device_type}, which is {vm_required_by_workload} VMs, but the'
104-
f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.'
105-
' XPK will not create this workload.'
106-
)
107-
return False
108105

109106
return True
110107

0 commit comments

Comments
 (0)