|
25 | 25 |
|
26 | 26 |
|
27 | 27 | def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
|
28 |
| - """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster). |
| 28 | + """Check if workload can schedule based on the cluster resources. |
| 29 | +
|
| 30 | + This function validates that the resource requested by the user exists in the |
| 31 | + cluster's resource manifest (a ConfigMap) and that the requested quantity |
| 32 | + (e.g., number of VMs) does not exceed the available quantity. |
29 | 33 |
|
30 | 34 | Args:
|
31 |
| - args: user provided arguments for running the command. |
32 |
| - system: system characteristics |
| 35 | + args: User-provided arguments for running the command. |
| 36 | + system: System characteristics derived from the user's request. |
33 | 37 |
|
34 | 38 | Returns:
|
35 |
| - returns true if workload can schedule, otherwise returns false. |
| 39 | + True if the workload can be scheduled, otherwise False. |
36 | 40 | """
|
37 | 41 | resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
38 | 42 | cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
|
39 | 43 |
|
40 |
| - # Prevents workload creation failure for existing clusters with no ConfigMap |
| 44 | + # If no ConfigMap exists, we cannot validate, so we optimistically proceed. |
| 45 | + # This maintains compatibility with older cluster setups. |
41 | 46 | if cluster_config_map is None:
|
42 | 47 | xpk_print(
|
43 |
| - 'No ConfigMap exist for cluster with the name' |
44 |
| - f' {resources_configmap_name}.' |
| 48 | + f"Warning: Could not find resource ConfigMap '{resources_configmap_name}'. " |
| 49 | + "Proceeding without resource validation." |
45 | 50 | )
|
46 | 51 | return True
|
47 | 52 |
|
48 |
| - # Check for gke accelerator type: |
49 |
| - missing_gke_accelerator_type = False |
50 |
| - if not cluster_config_map.get(system.gke_accelerator): |
| 53 | + # The user-facing device type (e.g., 'v5litepod-32') is the single source |
| 54 | + # of truth for identifying the resource in the cluster's manifest. |
| 55 | + user_facing_device_type = args.tpu_type if args.tpu_type else args.device_type |
| 56 | + |
| 57 | + # --- Primary Validation --- |
| 58 | + # Check if the cluster's resource manifest contains an entry for the exact |
| 59 | + # device type the user requested. This is the only reliable existence check. |
| 60 | + if user_facing_device_type not in cluster_config_map: |
51 | 61 | xpk_print(
|
52 |
| - f'Gke Accelerator Type Check: {args.workload} is requesting' |
53 |
| - f' {system.gke_accelerator} but cluster only contains' |
54 |
| - f' {cluster_config_map.keys()}. ' |
| 62 | + f"Device Type Check Failed: Workload '{args.workload}' is requesting " |
| 63 | + f"device type '{user_facing_device_type}', but the cluster's resource " |
| 64 | + f"manifest only contains entries for: {list(cluster_config_map.keys())}. " |
| 65 | + "The cluster may not be provisioned with this hardware type." |
55 | 66 | )
|
56 |
| - missing_gke_accelerator_type = True |
57 |
| - elif ( |
58 |
| - cluster_config_map[system.gke_accelerator] |
| 67 | + return False |
| 68 | + |
| 69 | + # --- Quantity Validation --- |
| 70 | + |
| 71 | + # Handle autoprovisioning capacity checks. |
| 72 | + if ( |
| 73 | + cluster_config_map[user_facing_device_type] |
59 | 74 | == AUTOPROVISIONING_CONFIG_VALUE
|
60 | 75 | ):
|
61 |
| - # Run total chip check when in autoprovisioning mode. |
62 | 76 | max_chips_in_cluster = int(
|
63 |
| - cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY] |
| 77 | + cluster_config_map.get(AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, 0) |
64 | 78 | )
|
65 | 79 | num_chips_in_workload = get_total_chips_requested_from_args(args, system)
|
66 | 80 |
|
67 | 81 | if num_chips_in_workload > max_chips_in_cluster:
|
68 | 82 | xpk_print(
|
69 |
| - f'{args.workload} is requesting {num_chips_in_workload} chips but' |
70 |
| - f' the cluster {args.cluster} supports up to {max_chips_in_cluster}.' |
71 |
| - ' Resize the cluster to support more chips with' |
72 |
| - ' `xpk cluster create --autoprovisioning-max-chips=X ...`' |
| 83 | + f"Chip Request Exceeds Limit: Workload '{args.workload}' requests " |
| 84 | + f"{num_chips_in_workload} chips, but the autoprovisioning cluster " |
| 85 | + f"'{args.cluster}' is configured for a maximum of {max_chips_in_cluster} chips." |
73 | 86 | )
|
74 | 87 | return False
|
75 |
| - return True |
| 88 | + return True # For autoprovisioning, chip count is sufficient. |
76 | 89 |
|
77 |
| - # Check for device type |
78 |
| - missing_device_type = False |
79 |
| - device_type = system.device_type |
80 |
| - if device_type not in cluster_config_map: |
81 |
| - xpk_print( |
82 |
| - f'Device Type Check: {args.workload} is requesting {device_type} but ' |
83 |
| - f'cluster only contains {cluster_config_map.keys()}. ' |
84 |
| - ) |
85 |
| - missing_device_type = True |
| 90 | + # For statically-sized clusters, check if the number of requested VMs fits. |
| 91 | + max_vm_in_cluster = int(cluster_config_map[user_facing_device_type]) |
| 92 | + if system.accelerator_type == AcceleratorType['GPU']: |
| 93 | + vm_required_by_workload = args.num_nodes |
| 94 | + else: |
| 95 | + vm_required_by_workload = args.num_slices * system.vms_per_slice |
86 | 96 |
|
87 |
| - if missing_device_type and missing_gke_accelerator_type: |
| 97 | + if vm_required_by_workload > max_vm_in_cluster: |
88 | 98 | xpk_print(
|
89 |
| - 'Both Device Type and GKE Accelerator Type checks failed.' |
90 |
| - f' XPK will not create the workload {args.workload}.' |
| 99 | + f"VM Request Exceeds Capacity: Workload '{args.workload}' requests " |
| 100 | + f"{vm_required_by_workload} VMs for {args.num_slices} slice(s) of type " |
| 101 | + f"'{user_facing_device_type}', but the cluster only has " |
| 102 | + f"{max_vm_in_cluster} VMs of that type available." |
91 | 103 | )
|
92 | 104 | return False
|
93 |
| - else: |
94 |
| - # Check if the size of the workload will fit in the cluster. |
95 |
| - max_vm_in_cluster = int(cluster_config_map[device_type]) |
96 |
| - if system.accelerator_type == AcceleratorType['GPU']: |
97 |
| - vm_required_by_workload = args.num_nodes |
98 |
| - else: |
99 |
| - vm_required_by_workload = args.num_slices * system.vms_per_slice |
100 |
| - if vm_required_by_workload > max_vm_in_cluster: |
101 |
| - xpk_print( |
102 |
| - f'{args.workload} is requesting {args.num_slices} slice/slices of' |
103 |
| - f' {device_type}, which is {vm_required_by_workload} VMs, but the' |
104 |
| - f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.' |
105 |
| - ' XPK will not create this workload.' |
106 |
| - ) |
107 |
| - return False |
108 | 105 |
|
109 | 106 | return True
|
110 | 107 |
|
|
0 commit comments