diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 99126db65..eec16bf82 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -50,7 +50,6 @@ install_kueue_crs, install_kueue_on_cluster, wait_for_kueue_available, - update_kueue_resources_if_necessary, ) from ..core.nap import enable_autoprovisioning_on_cluster from ..core.network import ( @@ -973,11 +972,6 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config): if enable_kueue_credentials_code != 0: xpk_exit(enable_kueue_credentials_code) - xpk_print('Update Kueue Controller Manager resources') - update_kueue_resources_code = update_kueue_resources_if_necessary(args) - if update_kueue_resources_code != 0: - xpk_exit(update_kueue_resources_code) - def prepare_gpus(args, system: SystemCharacteristics): xpk_print('Installing NCCL Plugin for cluster') diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 8f69c40c8..30d33ca67 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -16,10 +16,9 @@ from argparse import Namespace -import math import packaging from packaging.version import Version - +from ..core.blueprint.blueprint_generator import cluster_toolkit_version from ..utils.console import xpk_exit, xpk_print from ..utils.file import write_tmp_file from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE @@ -190,92 +189,6 @@ command: [ "sleep", "inf" ] """ -kueue_controller_manager_yml = """ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/component: controller - app.kubernetes.io/name: kueue - control-plane: controller-manager - name: kueue-controller-manager - namespace: kueue-system -spec: - replicas: 1 - selector: - matchLabels: - control-plane: controller-manager - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - app.kubernetes.io/component: controller - app.kubernetes.io/name: kueue - control-plane: controller-manager - spec: - containers: - - args: - - --config=/controller_manager_config.yaml - - --zap-log-level=2 - command: - - /manager - image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION} - imagePullPolicy: Always - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - name: manager - ports: - - containerPort: 8082 - name: visibility - protocol: TCP - - containerPort: 9443 - name: webhook-server - protocol: TCP - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: 500m - memory: {memory_limit_size} - requests: - cpu: 500m - memory: 512Mi - securityContext: - allowPrivilegeEscalation: false - volumeMounts: - - mountPath: /visibility - name: visibility - - mountPath: /tmp/k8s-webhook-server/serving-certs - name: cert - readOnly: true - - mountPath: /controller_manager_config.yaml - name: manager-config - subPath: controller_manager_config.yaml - securityContext: - runAsNonRoot: true - serviceAccountName: kueue-controller-manager - terminationGracePeriodSeconds: 10 - volumes: - - name: visibility - emptyDir: {{}} - - name: cert - secret: - defaultMode: 420 - secretName: kueue-webhook-server-cert - - configMap: - name: kueue-manager-config - name: manager-config -""" - def verify_kueuectl(args: Namespace) -> None: """Verify if kueuectl is installed. @@ -359,7 +272,8 @@ def install_kueue_on_cluster(args) -> int: command = ( 'kubectl apply --server-side --force-conflicts -f' - f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml' + f' https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{cluster_toolkit_version}' + f'/modules/management/kubectl-apply/manifests/kueue-{KUEUE_VERSION}.yaml' ) task = 'Set Kueue On Cluster' return_code = run_command_with_updates_retry(command, task, args) @@ -506,36 +420,3 @@ def get_kueue_covered_resources_config( total_chips=total_chips, ) return config_string - - -def update_kueue_resources_if_necessary(args): - """Update the kueue manifest to increase the resources for the kueue controller manager. - - Args: - args: user provided arguments for running the command. - - Returns: - 0 if successful and 1 otherwise. - """ - # Get total number of nodes - cmd_total_node_num = 'kubectl get node --no-headers | wc -l' - return_code, out = run_command_for_value( - cmd_total_node_num, 'Count total nodes', args - ) - if return_code != 0: - xpk_exit(1) - # 1.2MiB per VM or 4GiB (whichever is greater). - new_memory_limit = ( - f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi' - ) - yml_string = kueue_controller_manager_yml.format( - memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION - ) - tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' - - task = 'Updating Kueue Controller Manager resources' - return_code = run_command_with_updates_retry(command, task, args) - if return_code != 0: - xpk_print(f'{task} returned ERROR {return_code}') - return return_code